the magic of linux clone

Processes and threads are concepts that have been used across many operating systems and programming languages. While we tend to think of ‘process’ as a heavy thing and ’thread’ as a light thing, how they are implemented and handled depends on context. This article will focus on how the Linux operating system handles these concepts and the mechanisms used.

Creating a new process

Generally in Linux if you want to create a new process in C you can do the following:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>

int main()
{
	pid_t pid;
	pid = fork();

	if (pid < 0) {
		exit(-1);
	}

	if (pid == 0) {
		pid_t tid = syscall(__NR_gettid);
		printf("child: pid:%ld tid:%ld\n", (long)getpid(), (long)tid);
	} else {
		pid_t tid = syscall(__NR_gettid);
		printf("parent: pid:%ld tid:%ld\n", (long)getpid(), (long)tid);
	}
	return 0;
}

Output from the program:

child: pid:9154 tid:9154
parent: pid:9153 tid:9153

Notice that process and thread ids have changed and that pid == tid in this model. Now if you strace the program you will see the following:

execve("./a.out", ["./a.out"], 0x7fff319c16d0 /* 61 vars */) = 0
brk(NULL)                               = 0x557cdc5e3000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=99382, ...}) = 0
mmap(NULL, 99382, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f2719d5e000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260\34\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2030544, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f2719d5c000
mmap(NULL, 4131552, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f271975f000
mprotect(0x7f2719946000, 2097152, PROT_NONE) = 0
mmap(0x7f2719b46000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e7000) = 0x7f2719b46000
mmap(0x7f2719b4c000, 15072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f2719b4c000
close(3)                                = 0
arch_prctl(ARCH_SET_FS, 0x7f2719d5d4c0) = 0
mprotect(0x7f2719b46000, 16384, PROT_READ) = 0
mprotect(0x557cdb1d6000, 4096, PROT_READ) = 0
mprotect(0x7f2719d77000, 4096, PROT_READ) = 0
munmap(0x7f2719d5e000, 99382)           = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f2719d5d790) = 9154
gettid()                                = 9153
getpid()                                = 9153
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 2), ...}) = 0
brk(NULL)                               = 0x557cdc5e3000
brk(0x557cdc604000)                     = 0x557cdc604000
write(1, "parent: pid:9153 tid:9153\n", 26) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=9154, si_uid=1000, si_status=0, si_utime=0, si_stime=0} ---
write(1, "parent: pid:9153 tid:9153\n", 26) = 26
exit_group(0)                           = ?
+++ exited with 0 +++

Note the lack of fork in the above trace. In Linux and specifically programs that use glibc will utilize the clone system call with specific flags (CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD) to accomplish a fork.

Creating a new thread

The following program creates two threads and also prints information about process id and thread id for the newly created threads and even the main thread.

#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>

void *thread_function(void *ptr)
{
	pid_t tid = syscall(__NR_gettid);
	printf("thread:#%d pid:%ld tid:%ld\n", (int)ptr, (long)getpid(),
	       (long)tid);
	return ptr;
}

int main(int argc, char **argv)
{
	pthread_t thread1, thread2;
	pthread_create(&thread1, NULL, *thread_function, (void *)1);
	pthread_create(&thread2, NULL, *thread_function, (void *)2);

	/* main thread */
	pid_t tid = syscall(__NR_gettid);
	printf("thread:#%d pid:%ld tid:%ld\n", (int)0, (long)getpid(),
	       (long)tid);

	pthread_join(thread1, NULL);
	pthread_join(thread2, NULL);
	return 0;
}

Example output from this program:

thread:#1 pid:7918 tid:7919
thread:#0 pid:7918 tid:7918
thread:#2 pid:7918 tid:7920

Notice that the process ID remains the same, while the thread ID changes.

Strace output from the program:

execve("./a.out", ["./a.out"], 0x7fff882a00d0 /* 61 vars */) = 0
brk(NULL)                               = 0x55f25c150000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=99382, ...}) = 0
mmap(NULL, 99382, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f8a74eda000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0000b\0\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=144976, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8a74ed8000
mmap(NULL, 2221184, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8a74aad000
mprotect(0x7f8a74ac7000, 2093056, PROT_NONE) = 0
mmap(0x7f8a74cc6000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x19000) = 0x7f8a74cc6000
mmap(0x7f8a74cc8000, 13440, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f8a74cc8000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260\34\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2030544, ...}) = 0
mmap(NULL, 4131552, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f8a746bc000
mprotect(0x7f8a748a3000, 2097152, PROT_NONE) = 0
mmap(0x7f8a74aa3000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e7000) = 0x7f8a74aa3000
mmap(0x7f8a74aa9000, 15072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f8a74aa9000
close(3)                                = 0
mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f8a74ed5000
arch_prctl(ARCH_SET_FS, 0x7f8a74ed5740) = 0
mprotect(0x7f8a74aa3000, 16384, PROT_READ) = 0
mprotect(0x7f8a74cc6000, 4096, PROT_READ) = 0
mprotect(0x55f25a1c7000, 4096, PROT_READ) = 0
mprotect(0x7f8a74ef3000, 4096, PROT_READ) = 0
munmap(0x7f8a74eda000, 99382)           = 0
set_tid_address(0x7f8a74ed5a10)         = 9270
set_robust_list(0x7f8a74ed5a20, 24)     = 0
rt_sigaction(SIGRTMIN, {sa_handler=0x7f8a74ab2cb0, sa_mask=[], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7f8a74abf890}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {sa_handler=0x7f8a74ab2d50, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f8a74abf890}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
prlimit64(0, RLIMIT_STACK, NULL, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f8a73ebb000
mprotect(0x7f8a73ebc000, 8388608, PROT_READ|PROT_WRITE) = 0
brk(NULL)                               = 0x55f25c150000
brk(0x55f25c171000)                     = 0x55f25c171000
clone(child_stack=0x7f8a746bafb0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f8a746bb9d0, tls=0x7f8a746bb700, child_tidptr=0x7f8a746bb9d0) = 9271
mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f8a736ba000
mprotect(0x7f8a736bb000, 8388608, PROT_READ|PROT_WRITE) = 0
clone(child_stack=0x7f8a73eb9fb0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f8a73eba9d0, tls=0x7f8a73eba700, child_tidptr=0x7f8a73eba9d0) = 9272
gettid()                                = 9270
getpid()                                = 9270
write(1, "thread:#0 pid:9270 tid:9270\n", 28) = 28
exit_group(0)                           = ?
+++ exited with 0 +++

Here note two calls to clone. This time with different flags: CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID.

One flag in particular stands out which is CLONE_THREAD. Now, let’s look more into the clone system call.

The clone system call

The description for clone says: create a child process. In addition looking more into the man page for this call we notice that there are many flags that can be used for different behaviors. Above we saw that fork and pthread_create just use clone with different flags.

A summary of the flags (as copied from the Linux kernel sources):

#define CLONE_VM    0x00000100	/* set if VM shared between processes */
#define CLONE_FS    0x00000200	/* set if fs info shared between processes */
#define CLONE_FILES 0x00000400	/* set if open files shared between processes */
#define CLONE_SIGHAND   0x00000800	/* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE    0x00002000	/* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000	/* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT    0x00008000	/* set if we want to have the same parent as the cloner */
#define CLONE_THREAD    0x00010000	/* Same thread group? */
#define CLONE_NEWNS 0x00020000	/* New mount namespace group */
#define CLONE_SYSVSEM   0x00040000	/* share system V SEM_UNDO semantics */
#define CLONE_SETTLS    0x00080000	/* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000	/* set the TID in the parent */
#define CLONE_CHILD_CLEARTID    0x00200000	/* clear the TID in the child */
#define CLONE_DETACHED      0x00400000	/* Unused, ignored */
#define CLONE_UNTRACED      0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID  0x01000000	/* set the TID in the child */
#define CLONE_NEWUTS        0x04000000	/* New utsname namespace */
#define CLONE_NEWIPC        0x08000000	/* New ipc namespace */
#define CLONE_NEWUSER       0x10000000	/* New user namespace */
#define CLONE_NEWPID        0x20000000	/* New pid namespace */
#define CLONE_NEWNET        0x40000000	/* New network namespace */
#define CLONE_IO        0x80000000	/* Clone io context */

In the “fork” case above we see only flags specifying clearing and settings of TID are expressed. Thus, the pids and tid are different.

In the above “thread” case we see that virtual memory, files, open files, signal handlers are shared. In addition CLONE_THREAD is used which does the following:

CLONE_THREAD (since Linux 2.4.0-test8)
If CLONE_THREAD is set, the child is placed in the same thread group as the
calling process.
Thread groups were a feature added in Linux 2.4 to support the POSIX threads
notion of a set of threads that share a single PID. Internally, this shared PID
is the so-called thread group identifier (TGID) for the thread group. Since
Linux 2.4, calls to getpid(2) return the TGID of the caller.

So this case makes since because the pid is the same; but the tid changes. However the language is confusing; the tgid in the kernel actually maps to the pid in userspace, and the pid in kernel space maps to the tid in userspace!