This is a very simple example of how instruction reordering can break a program with multiple threads. Consider the following example:
#include <pthread.h>
#include <stdio.h>
// Payload: written by thread1_func, read by thread2_func.
int shared_data;
// Ready flag: set to 1 by thread1_func after it writes shared_data;
// thread2_func spins until it reads a non-zero value.
// Both variables are plain ints (neither volatile nor atomic).
int flag;
// Writer thread: stores 42 to shared_data, then sets flag to 1, printing a
// message before and after.
// arg is unused; the function always returns NULL.
void *thread1_func(void *arg) {
printf("Thread 1: Starting...\n");
shared_data = 42;
// No barrier separates this store from the store to shared_data above.
flag = 1;
printf("Thread 1: Data set to %d, Flag set to %d\n", shared_data, flag);
return NULL;
}
// Reader thread: waits for flag to become non-zero, then checks that
// shared_data holds 42 and prints the outcome.
// arg is unused; the function always returns NULL.
void *thread2_func(void *arg) {
printf("Thread 2: Starting...\n");
while (flag == 0) {
// Busy-wait with an empty body until flag is observed as non-zero.
// flag is a plain int (not volatile or atomic) and nothing inside this
// loop writes to it.
;
}
// shared_data is read separately for this print, for the comparison and for
// the error message; no barrier sits between the flag check and these reads.
printf("Thread 2: Flag is set! Reading shared_data: %d\n", shared_data);
if (shared_data != 42) {
printf("Thread 2: ERROR! Expected shared_data to be 42, but got %d\n",
shared_data);
printf("Thread 2: Instruction reordering likely caused this issue!\n");
} else {
printf("Thread 2: Success! Shared data is as expected.\n");
}
return NULL;
}
// Entry point: resets the shared variables, starts the writer and reader
// threads, and waits for both to finish.
// Returns 0 on success and 1 if a thread could not be created or joined.
int main(void) {
    pthread_t thread1, thread2;
    int rc;

    shared_data = 0; // Initialize shared data
    flag = 0;        // Initialize flag

    printf("Main: Creating threads...\n");

    // pthread functions return an error number instead of setting errno.
    rc = pthread_create(&thread1, NULL, thread1_func, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_create for thread 1 failed (error %d)\n", rc);
        return 1;
    }

    rc = pthread_create(&thread2, NULL, thread2_func, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_create for thread 2 failed (error %d)\n", rc);
        // Thread 1 is already running; wait for it before leaving.
        pthread_join(thread1, NULL);
        return 1;
    }

    rc = pthread_join(thread1, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_join for thread 1 failed (error %d)\n", rc);
        return 1;
    }

    rc = pthread_join(thread2, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_join for thread 2 failed (error %d)\n", rc);
        return 1;
    }

    printf("Main: Threads finished.\n");
    return 0;
}
Now, in order for reordering to happen, let’s compile this with `-O2`.
clang -O2 -o program program.c
Now let’s run this executable 1000 times using a Python script and check for failures:
import subprocess

# How many times the compiled C program is executed.
num_runs = 1000
# Name of the binary; it is launched as "./<executable>" from the current directory.
executable = "program"

failure_count = 0
success_count = 0

print(f"\nRunning the program {num_runs} times to check for failures...")

for i in range(num_runs):
    run_command = ["./" + executable]
    # Capture stdout as text so it can be searched for the error marker.
    run_process = subprocess.run(run_command, capture_output=True, text=True)
    output = run_process.stdout

    # The C program prints this marker when thread 2 reads a value other than 42.
    if "Thread 2: ERROR!" in output:
        failure_count += 1
        print(
            f"Run {i+1}: FAILURE DETECTED! stdout: {output}"
        )
    else:
        success_count += 1

print("\n--- Summary ---")
print(f"Total runs: {num_runs}")
print(f"Successes: {success_count}")
print(f"Failures: {failure_count}")

if failure_count > 0:
    print(
        "\nFAILURES DETECTED! Instruction reordering likely caused issues in some runs."
    )
    print("This demonstrates the race condition in the C program.")
else:
    print("\nNo failures detected in these runs.")
On my machine running the script produces the following:
Hedwig :: /tmp » python3.11 b.py
Running the program 1000 times to check for failures...
Run 418: FAILURE DETECTED! stdout: Main: Creating threads...
Thread 2: Starting...
Thread 2: Flag is set! Reading shared_data: 0
Thread 2: ERROR! Expected shared_data to be 42, but got 0
Thread 2: Instruction reordering likely caused this issue!
Thread 1: Starting...
Thread 1: Data set to 42, Flag set to 1
Main: Threads finished.
Run 426: FAILURE DETECTED! stdout: Main: Creating threads...
Thread 2: Starting...
Thread 2: Flag is set! Reading shared_data: 0
Thread 2: ERROR! Expected shared_data to be 42, but got 0
Thread 2: Instruction reordering likely caused this issue!
Thread 1: Starting...
Thread 1: Data set to 42, Flag set to 1
Main: Threads finished.
Run 444: FAILURE DETECTED! stdout: Main: Creating threads...
Thread 2: Starting...
Thread 2: Flag is set! Reading shared_data: 0
Thread 2: ERROR! Expected shared_data to be 42, but got 0
Thread 2: Instruction reordering likely caused this issue!
Thread 1: Starting...
Thread 1: Data set to 42, Flag set to 1
Main: Threads finished.
--- Summary ---
Total runs: 1000
Successes: 997
Failures: 3
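Now let’s add a compiler barrier and rerun the script, but this time 10000 times. (Note that the `dmb ish` instruction used in the macro below is also a hardware memory barrier on ARM, not only a compiler barrier.)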
#include <pthread.h>
#include <stdio.h>
// Payload: written by thread1_func, read by thread2_func.
int shared_data;
// Ready flag: set to 1 by thread1_func; thread2_func spins until it is non-zero.
// Still a plain int here (neither volatile nor atomic).
int flag;
// Despite the name, this is more than a compiler barrier: the "memory" clobber
// stops the compiler from moving memory accesses across the statement, and
// `dmb ish` is an ARM data memory barrier instruction.
// NOTE(review): `dmb ish` is ARM-specific, so this presumably does not
// assemble for other targets -- confirm before building elsewhere.
#define compiler_barrier() asm volatile("dmb ish" : : : "memory")
// Writer thread: stores 42 to shared_data, then sets flag to 1, with a
// barrier before the data store and another between the two stores.
// arg is unused; the function always returns NULL.
void *thread1_func(void *arg) {
printf("Thread 1: Starting...\n");
compiler_barrier();
shared_data = 42;
// Keeps the store to shared_data ahead of the store to flag below.
compiler_barrier();
flag = 1;
printf("Thread 1: Data set to %d, Flag set to %d\n", shared_data, flag);
return NULL;
}
// Reader thread: waits for flag to become non-zero, then checks that
// shared_data holds 42 and prints the outcome.
// arg is unused; the function always returns NULL.
void *thread2_func(void *arg) {
printf("Thread 2: Starting...\n");
// Busy-wait until flag is observed as non-zero. flag is a plain int in this
// version and nothing inside the loop writes to it.
while (flag == 0) {
;
}
// No barrier on the reader side: nothing separates the flag check from the
// reads of shared_data below.
printf("Thread 2: Flag is set! Reading shared_data: %d\n", shared_data);
if (shared_data != 42) {
printf("Thread 2: ERROR! Expected shared_data to be 42, but got %d\n",
shared_data);
printf("Thread 2: Instruction reordering likely caused this issue!\n");
} else {
printf("Thread 2: Success! Shared data is as expected.\n");
}
return NULL;
}
// Entry point: resets the shared variables, starts the writer and reader
// threads, and waits for both to finish.
// Returns 0 on success and 1 if a thread could not be created or joined.
int main(void) {
    pthread_t thread1, thread2;
    int rc;

    shared_data = 0;
    flag = 0;

    printf("Main: Creating threads...\n");

    // pthread functions return an error number instead of setting errno.
    rc = pthread_create(&thread1, NULL, thread1_func, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_create for thread 1 failed (error %d)\n", rc);
        return 1;
    }

    rc = pthread_create(&thread2, NULL, thread2_func, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_create for thread 2 failed (error %d)\n", rc);
        // Thread 1 is already running; wait for it before leaving.
        pthread_join(thread1, NULL);
        return 1;
    }

    rc = pthread_join(thread1, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_join for thread 1 failed (error %d)\n", rc);
        return 1;
    }

    rc = pthread_join(thread2, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_join for thread 2 failed (error %d)\n", rc);
        return 1;
    }

    printf("Main: Threads finished.\n");
    return 0;
}
This still fails a couple of times. The final trick is to make the flag variable volatile, since the snippet:
while (flag == 0){
}
needs to read the flag from memory on every iteration (since the other thread may change the value at any moment). Marking it volatile means the compiler can neither make assumptions about the variable’s value nor cache it in a register and keep reading it from there.
Just to illustrate this a little better, this is the diff between the instructions generated when flag is non volatile and volatile:
When flag is volatile:
100003d08: b0000028 adrp x8, 0x100008000 <_flag>
100003d0c: 91000108 add x8, x8, #0x0
100003d10: b9400109 ldr w9, [x8]
100003d14: 34ffffe9 cbz w9, 0x100003d10 <_thread2_func+0x24>
First, the address of the page containing the `_flag` variable is loaded into `x8` (`adrp`). Then the offset of `_flag` within that page is added (the `add` instruction). Next, the value at the address held in `x8` (the value of `_flag` in memory) is loaded into `w9`. If the value in `w9` is zero (`cbz` means "compare and branch on zero"), we go back to the `ldr` and check again whether the value of `_flag` in memory has changed.
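For the non-volatile case, the assembly for the while loop is:
(nothing: no instructions are emitted for the loop)
Yes, this is not a typo: the compiler optimized the while loop away. It may do so because, from the point of view of thread 2, flag is a plain variable that is never modified inside the loop, so the compiler is free to assume its value does not change between iterations (I would assume interprocedural constant propagation plays a part here). On top of that, C11 allows the compiler to assume that a loop with no side effects and a non-constant controlling expression terminates, so the whole loop can be dropped.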
#include <pthread.h>
#include <stdio.h>
// Payload: published by thread1_func, consumed by thread2_func.
int shared_data;

// Ready flag: set to 1 by thread1_func after shared_data has been written.
// It is volatile so that every evaluation of `flag == 0` in the spin loop
// performs a real load instead of reusing an earlier value.
volatile int flag;

// Despite its name this is a full barrier, not only a compiler barrier:
// the "memory" clobber keeps the compiler from moving memory accesses across
// it, and the hardware is also told to order accesses before it against
// accesses after it.
// On AArch64 this is the same `dmb ish` instruction as before; other targets
// fall back to the compiler's generic full-barrier builtin so the example
// still builds there. `__asm__` is used because plain `asm` is not a keyword
// in strict ISO C modes.
#if defined(__aarch64__)
#define compiler_barrier() __asm__ __volatile__("dmb ish" : : : "memory")
#else
#define compiler_barrier()                       \
    do {                                         \
        __asm__ __volatile__("" : : : "memory"); \
        __sync_synchronize();                    \
    } while (0)
#endif

// Writer thread: stores 42 to shared_data and then sets flag to 1.
// arg is unused; the function always returns NULL.
void *thread1_func(void *arg) {
    (void)arg;

    printf("Thread 1: Starting...\n");
    compiler_barrier();
    shared_data = 42;
    // Publish barrier: the store to shared_data must be visible before the
    // store to flag.
    compiler_barrier();
    flag = 1;
    printf("Thread 1: Data set to %d, Flag set to %d\n", shared_data, flag);
    return NULL;
}

// Reader thread: spins until flag is non-zero, then checks that shared_data
// holds 42 and prints the outcome.
// arg is unused; the function always returns NULL.
void *thread2_func(void *arg) {
    (void)arg;

    printf("Thread 2: Starting...\n");
    while (flag == 0) {
        ; // busy-wait; flag is volatile, so it is reloaded on every iteration
    }

    // Consume barrier, pairing with the one in thread1_func: without it the
    // load of shared_data below may be performed before the load of flag that
    // ended the loop (neither the compiler nor a weakly ordered CPU is
    // otherwise required to keep the two loads in program order).
    compiler_barrier();

    // Read once so that the value printed and the value checked are the same.
    int observed = shared_data;

    printf("Thread 2: Flag is set! Reading shared_data: %d\n", observed);
    if (observed != 42) {
        printf("Thread 2: ERROR! Expected shared_data to be 42, but got %d\n",
               observed);
        printf("Thread 2: Instruction reordering likely caused this issue!\n");
    } else {
        printf("Thread 2: Success! Shared data is as expected.\n");
    }
    return NULL;
}
// Entry point: resets the shared variables, starts the writer and reader
// threads, and waits for both to finish.
// Returns 0 on success and 1 if a thread could not be created or joined.
int main(void) {
    pthread_t thread1, thread2;
    int rc;

    shared_data = 0;
    flag = 0;

    printf("Main: Creating threads...\n");

    // pthread functions return an error number instead of setting errno.
    rc = pthread_create(&thread1, NULL, thread1_func, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_create for thread 1 failed (error %d)\n", rc);
        return 1;
    }

    rc = pthread_create(&thread2, NULL, thread2_func, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_create for thread 2 failed (error %d)\n", rc);
        // Thread 1 is already running; wait for it before leaving.
        pthread_join(thread1, NULL);
        return 1;
    }

    rc = pthread_join(thread1, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_join for thread 1 failed (error %d)\n", rc);
        return 1;
    }

    rc = pthread_join(thread2, NULL);
    if (rc != 0) {
        fprintf(stderr, "Main: pthread_join for thread 2 failed (error %d)\n", rc);
        return 1;
    }

    printf("Main: Threads finished.\n");
    return 0;
}
And now we get the following message:
Running the program b for 10000 times to check for failures...
--- Summary ---
Total runs: 10000
Successes: 10000
Failures: 0