/*
clone() - a slightly bent fork()
------------------------------------
clone code by Patrick Schaaf <bof@wg.saar.de>


clone() has its own entry in the syscall tables, but runs sys_fork().
This is found in kernel/fork.c

The syscall expects the following parameters:
  %eax		__NR_clone	(the syscall number)
  %ebx		stack pointer	(where the child stack should be)
  %ecx		clone flags

OR some of the following constants together to build the flags argument:

  COPYVM
    page tables will be copied to the new process, separating the address
    spaces of parent and child.  This is forced for apparent reasons when
    the stack pointer for the child is the same as for the parent, and it is
    the default for the real fork().

    If COPYVM is not set (this would be normal for a thread implementation),
    mm/memory.c:clone_page_tables() sets up sharing the page directory
    between parent and child.  This should mean that any changes in memory
    mapping in the child or parent will affect the other process, i.e. mmap()
    might work.  I wonder who uses those vm_area thingies.  It looks like
    ->stk_vma is not ok for the clone (a copy of the parents area).  The clone
    has to ensure there is memory at its own stack pointer, probably
    using mmap() for anonymous memory, but it will still get the self-growing
    stack behaviour in the original stk_vma.  I don't want to think about
    what this means for consistency.  When allocating the stack, remember
    those thingies grow towards lower addresses.

    On process exit, if the page directory is still shared with someone,
    the reference is decremented (mm/memory.c:free_page_tables()); the
    tables are left intact until the last decrement.

    It even looks like you can have any of your clones or the parent
    use exec(); that uses mm/memory.c:clear_page_tables(), which does
    the right thing to the page directory.

  COPYFD
    usually, a fork() increments the reference count of the 'struct file'
    for all open file descriptors, parent and child thus share file offset
    and flags.  When COPYFD is given, the child receives duplicates of
    the 'struct file's sharing only the inode (as if they had been
    open()ed independently).
    
  COPYPID
    usually, fork() gives PID of the child as incremented 16-bit number.
    If 32767 is reached, it wraps around to 1 (skips 0. 0 is scheduler).
    When COPYPID is given, the child gets parent's lower 16 bits of the
    PID copied, and upper 16 bits start from 1 and follow the wraparound
    scheme. Example when COPYPID is given:
    Parent process, PID = 128 = 0x80 is running.
    clone children will therefore receive PID 0x10080, 0x20080, ...
    There is some support for 32-bit PIDs in the /proc filesystem, and a
    patch for kmem-ps to show them.

COPYVM, COPYFD and COPYPID come from <linux/sched.h>

You additionally OR a signal number into the flags, which will be sent to the
parent on child exit instead of the usual SIGCHLD.
If I have the parent die before the clones are dead, this might be a way for
normal user processes to confuse the heck out of init.  Whoops, this is
handled in kernel/exit.c:notify_parent().  It is reassuring to find all
nasty things you can think of already handled properly :-)

Besides the sharing mentioned, the clones are independent processes.
When one of them opens a file, nobody else sees it.  When one of them closes
a file, the others keep it open.  Maybe you can pass newly opened files via
Unix domain sockets or SysV message queues - is this implemented?

Also not shared is the SysV IPC stuff, and a lot of Posix process
attributes (pids, pgrps, sessions, controlling ttys, whatever).
This especially means that all clones are scheduled independently,
can run with varying persona, and block without interference.


Related to clones is kernel/exit.c:sys_wait4().  You can OR __WCLONE
into the options parameter to modify wait4() behaviour as follows:

  exit signal SIGCHLD for the child (normal fork() case)
    __WCLONE given:
      does not wait on that child
    __WCLONE not given:
      waits
  exit signal not SIGCHLD (child was created by clone())
    __WCLONE given:
      waits
    __WCLONE not given:
      does not wait on that child

Looks like if you want to manage clone() termination, you should choose
an exit signal other than SIGCHLD, and use wait4(..,..,__WCLONE,..) to
wait for the clone()s only, and wait4(..,..,0,..) to wait for normally
fork()ed processes only.

__WCLONE comes from <linux/wait.h>


clone()'d processes are rather heavyweight for threads, and might not be
appropriate for the threading you want to do.  I do not want to say they
are useless, though; maybe a two-level approach to threading (I hear
that Sun does it) works for all cases (implement user-level threading
within each clone).


Whoever managed to follow me up to here should be rewarded; here is a
working example for using clone():
*/

#include <errno.h>
#include <stdio.h>
#include <signal.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/unistd.h>

/* NOTE: the clone() starts up on a new stack. Thus, we cannot return
 * from this function in the clone (we would have to copy stack frames
 * for main and the call). Instead, we call the given function, and
 * call _exit() with whatever it returns.
 */

/* Two-level stringification: DEREF_STR() lets its argument be macro
 * expanded first, so __NR_clone ends up in the assembler template as
 * its value rather than as the literal text "__NR_clone".
 */
#define STR(x) #x
#define DEREF_STR(x) STR(x)

/* do_clone() - issue the clone system call through "int $0x80".
 *
 *   esp    stack pointer for the clone        (passed in %ebx)
 *   flags  clone flags, exit signal ORed in   (passed in %ecx)
 *   func   function the clone has to run      (kept in %edx)
 *
 * In the parent this returns the value the system call left in %eax,
 * or -1 after copying %eax into errno when the carry flag is set on
 * return from the trap.  A zero in %eax identifies the clone, which
 * calls func() and hands func()'s return value to _exit(); the clone
 * never returns from this function.
 */
int do_clone(unsigned long esp, unsigned long flags, int (*func)(void))
{ int ret;

  /* volatile + "memory": the statement traps into the kernel and may
   * store to errno, so it must neither be dropped nor have memory
   * accesses cached across it.
   */
  __asm__ __volatile__ (
	   "movl $" DEREF_STR(__NR_clone) ", %%eax\n\t"
	   "int $0x80\n\t"
	   /* error?  A set carry flag is taken as the failure indication.
	    * NOTE(review): confirm that the target kernel reports system
	    * call errors through the carry flag.
	    */
	   "jnc 1f\n\t"
	   /* error. */
	   "movl %%eax, _errno\n\t"
	   "movl $-1, %0\n\t"
	   "jmp 3f\n\t"
	   "1:\n\t"
	   /* non-zero result: we are the parent */
	   "testl %0, %0\n\t"
	   "jne 3f\n\t"
	   /* the clone */
	   "call *%3\n\t"
	   /* exit with whatever func() returned (it is in %eax) */
	   "pushl %%eax\n\t"
	   "call _exit\n\t"
	   /* not reached; spin rather than fall through into the parent path */
	   "1:\n\t"
	   "jmp 1b\n\t"
	   /* the parent */
	   "3:\n\t"
	   : "=a" (ret)
	   : "b" (esp),		/* input goes into the correct */
	     "c" (flags),       /* registers for the syscall.  */
	     "d" (func)
	   : "memory", "cc"
  );
  return ret;
}

/* State shared between main(), the SIGUSR1 handler and the clone.
 *
 * clone_pid is stored by clone_function() and read by sigusr1() and
 * main(); do_terminate is set by sigusr1() and polled by main().  Both
 * are changed outside the normal flow of the code that reads them, so
 * they are volatile to force a fresh load on every access.
 */
volatile int clone_pid = -1;		/* -1 until the clone has stored its pid */
volatile sig_atomic_t do_terminate = 0;	/* set to 1 once the clone was reaped */

/* we use sigusr1() for child termination signalling
 *
 * Handler for SIGUSR1, the exit signal main() asks for when it creates
 * the clone.  It reaps the clone with wait4(..., __WCLONE, ...) and sets
 * do_terminate when the pid returned equals clone_pid.
 *
 * The handler is re-installed on every path out of the function,
 * including the wait4() failure path, so that a later SIGUSR1 is still
 * delivered here on systems where signal() handlers are reset when the
 * signal is delivered.
 */
void sigusr1(int sig)
{ int status;		/* wait4() stores the exit status through an int pointer */
  int pid;

  (void)sig;		/* only ever installed for SIGUSR1 */

  printf("parent: got SIGUSR1, waiting for children... clone_pid=%d\n",
         clone_pid);
  pid = wait4(clone_pid, &status, __WCLONE, (struct rusage *)0);
  if (pid < 0) {
    /* report right away, before any other call can change errno */
    perror("wait4");
  } else {
    printf("parent: wait4 returned %d\n", pid);
    if (clone_pid == pid)
      do_terminate = 1;
  }
  signal(SIGUSR1, sigusr1);
}

/* Memory the clone runs its stack on: 4 * 4096 bytes.  main() derives the
 * clone's initial stack pointer from the end of this array.
 */
char clone_stack[4 * 4096];

/* Body of the clone: records its own pid in clone_pid, announces itself
 * on stderr, sleeps for five seconds, announces its termination and
 * returns 0.
 */
int clone_function(void)
{
  int self = getpid();

  clone_pid = self;
  fprintf(stderr, "clone running, pid = %d\n", self);

  sleep(5);

  fputs("clone terminating\n", stderr);
  return 0;
}

/* Demo driver: installs the SIGUSR1 handler, starts clone_function() as
 * a clone running on clone_stack with SIGUSR1 as its exit signal, then
 * polls do_terminate once per second until the handler has reaped the
 * clone.  Always returns 0.
 */
int main(int argc, char **argv)
{ unsigned long stack_top;
  int pid;

  (void)argc;
  (void)argv;

  printf("parent pid = %d\n", getpid());

  signal(SIGUSR1, sigusr1);

  /* The stack grows towards lower addresses, so the clone starts at the
   * high end of clone_stack.  Round the end address down to a 16-byte
   * boundary so every push in the clone is aligned; the rounded value
   * never lies above the end of the array.
   */
  stack_top = (unsigned long)(clone_stack + sizeof(clone_stack)) & ~0xfUL;

  /* flags carry only the exit signal; no COPY* flag is requested */
  pid = do_clone(stack_top, SIGUSR1, clone_function);

  if (pid < 0)
    perror("clone");
  else if (pid == 0)
    fprintf(stderr, "funny, clone() returned pid=0 in parent\n");
  else {
    printf("parent: clone running, pid = %d. waiting for termination.\n", pid);
    while (!do_terminate) {
      printf("parent: child did not signal termination, yet.\n");
      sleep(1);
    }
    printf("parent: looks like our kid is gone. BTW, clone_pid = %d\n", clone_pid);
  }
  return 0;
}

