1. Multithreaded Hello World
Write two versions of a multithreaded “Hello World” program in which every thread prints a message like:
Hello, I'm thread number 3 of 5 threads
(a)Use Pthread as described in the lecture. The number of threads should be an argument to the program (e.g., ./pthread_hello 4
should create 4 threads). A skeleton and a Makefile are available for download.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
int nthreads;
void *perform_work(void *args) {
    /* Thread entry point: announce this worker's index out of the
     * global thread count (args points at the worker's index slot). */
    const int idx = *(const int *)args;
    printf("Hello, I'm thread number %d of %d threads\n", idx, nthreads);
    return NULL;
}
int main(int argc, char **argv) {
    /* Parse the requested thread count from the command line. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <nthreads>\n", argv[0]);
        return EXIT_FAILURE;
    }
    nthreads = atoi(argv[1]);
    if (nthreads < 1) {
        /* atoi returns 0 for non-numeric input; a negative count would
         * make the allocation sizes below wrap around. */
        fprintf(stderr, "nthreads must be a positive integer\n");
        return EXIT_FAILURE;
    }
    /* One handle and one stable index slot per thread.  sizeof *ptr ties
     * the element size to the variable's type; malloc needs no cast in C. */
    pthread_t *threads = malloc(nthreads * sizeof *threads);
    int *thread_indices = malloc(nthreads * sizeof *thread_indices);
    if (threads == NULL || thread_indices == NULL) {
        fprintf(stderr, "allocation failed\n");
        free(threads);
        free(thread_indices);
        return EXIT_FAILURE;
    }
    /* Start the workers: each receives a pointer to its own slot, which
     * stays valid until after the joins below. */
    int started = 0;
    for (int i = 0; i < nthreads; i++) {
        thread_indices[i] = i;
        if (pthread_create(&threads[i], NULL, perform_work,
                           &thread_indices[i]) != 0) {
            fprintf(stderr, "pthread_create failed for thread %d\n", i);
            break;
        }
        started++;
    }
    /* Join exactly the threads that were actually created. */
    for (int i = 0; i < started; i++) {
        pthread_join(threads[i], NULL);
    }
    free(threads);
    free(thread_indices);
    return started == nthreads ? EXIT_SUCCESS : EXIT_FAILURE;
}
- Parse the argument (`argv[1]`) to determine the number of threads (`nthreads`).
- Allocate memory for the `pthread_t` array and `thread_indices`.
- Create the threads: each thread is assigned a unique index via `thread_indices`.
- Why `thread_indices` is important and cannot be `NULL`: the `perform_work` function requires a unique identifier (`thread_index`) for each thread. `pthread_create` passes a pointer (`&thread_indices[i]`) to `perform_work`, which lets each thread know its specific index. If `NULL` were passed instead, `perform_work` would receive a `NULL` pointer and cause a segmentation fault when it dereferences it (`*((int *)args)`). The per-thread slots in `thread_indices` also ensure that each thread reads its own dedicated index, avoiding undefined behavior from sharing a single loop variable.
- The main program waits with `pthread_join` until all threads are finished.
- The program terminates cleanly, releasing the allocated arrays with `free`.
(b) Implement an OpenMP version. You may use any compiler that supports OpenMP. On the SuperMUC-NG, both icc
and gcc
support OpenMP.
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int nthreads;
void *perform_work(void *args) {
    /* Print the greeting for the worker whose index args points to. */
    int my_index;
    my_index = *((int *)args);
    printf("Hello, I'm thread number %d of %d threads\n", my_index, nthreads);
    return NULL;
}
int main(int argc, char **argv) {
    /* Parse the requested thread count from the command line. */
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <nthreads>\n", argv[0]);
        return EXIT_FAILURE;
    }
    nthreads = atoi(argv[1]);
    if (nthreads < 1) {
        fprintf(stderr, "nthreads must be a positive integer\n");
        return EXIT_FAILURE;
    }
    /* Request exactly nthreads OpenMP threads.  Without this call the
     * runtime chooses its own team size, so the loop below could be run
     * by fewer (or more) threads than requested and the printed thread
     * numbers would not match the requested count. */
    omp_set_num_threads(nthreads);
    /* One loop iteration per requested thread; each iteration reports
     * the OpenMP thread number that executes it. */
    #pragma omp parallel for
    for (int i = 0; i < nthreads; i++) {
        int thread_num = omp_get_thread_num();
        perform_work(&thread_num);
    }
    return EXIT_SUCCESS;
}
- Parse the argument (`argv[1]`) to determine the number of threads (`nthreads`).
- Use OpenMP's `#pragma omp parallel for` to parallelize the loop.
- Each thread executes the `perform_work` function with its unique thread index from `omp_get_thread_num`.
- The program completes once all threads have finished.
(c) Modify both versions so that the messages are printed ordered by thread number.
Pthreads Ordered
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
int nthreads;
void *perform_work(void *args) {
    /* Worker body: report this thread's index and the total count. */
    const int *index_ptr = (const int *)args;
    printf("Hello, I'm thread number %d of %d threads\n", *index_ptr, nthreads);
    return NULL;
}
int main(int argc, char **argv) {
    /* Parse and validate the requested thread count. */
    if (argc < 2) {
        return EXIT_FAILURE;
    }
    nthreads = atoi(argv[1]);
    if (nthreads < 1) {
        return EXIT_FAILURE;
    }
    /* Ordered output: start thread i and wait for it to finish before
     * starting thread i+1.  Because only one thread is ever alive at a
     * time, a single handle and a single index variable suffice — the
     * heap-allocated arrays of the unordered version are unnecessary. */
    for (int i = 0; i < nthreads; i++) {
        pthread_t thread;
        int thread_index = i;
        if (pthread_create(&thread, NULL, perform_work, &thread_index) != 0) {
            return EXIT_FAILURE;
        }
        pthread_join(thread, NULL);
    }
    return EXIT_SUCCESS;
}
OpenMP Ordered
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int nthreads;
void *perform_work(int thread_index) {
    /* Print the greeting for one loop iteration; the index is passed by
     * value here because no pthread-style void* contract is required. */
    printf("Hello, I'm thread number %d of %d threads\n",
           thread_index, nthreads);
    return NULL;
}
int main(int argc, char **argv) {
    if (argc < 2) {
        return EXIT_FAILURE;
    }
    nthreads = atoi(argv[1]);
    if (nthreads < 1) {
        /* omp_set_num_threads requires a positive argument; calling it
         * with 0 (e.g. from non-numeric input) or a negative value is
         * not allowed. */
        return EXIT_FAILURE;
    }
    omp_set_num_threads(nthreads);
    /* The ordered clause + directive force the marked region to execute
     * in loop-iteration order, so messages appear sorted by number. */
    #pragma omp parallel for ordered
    for (int i = 0; i < nthreads; i++) {
        #pragma omp ordered
        perform_work(i);
    }
    return EXIT_SUCCESS;
}
2. Calculation of π as the Limit of a Series
The Leibniz formula is one of many methods to calculate π numerically:

$$\frac{\pi}{4} = \sum_{k=0}^{\infty} \frac{(-1)^k}{2k+1} = 1 - \frac{1}{3} + \frac{1}{5} - \frac{1}{7} + \cdots$$
(a) Write a sequential program that calculates π using the Leibniz formula. Approximate at least 8 decimal places.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static inline long long timestamp() {
    /* Wall-clock time in nanoseconds since the Unix epoch. */
    struct timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    return now.tv_sec * 1000000000LL + now.tv_nsec;
}
static double pi(int N) {
    /* Partial sum of the Leibniz series: sum_{k=0}^{N-1} (-1)^k / (2k+1).
     * The caller multiplies the result by 4 to approximate pi. */
    double sum = 0.0;
    double sign = 1.0; /* alternates +1, -1, ... = (-1)^k */
    for (int k = 0; k < N; k++) {
        sum += sign / (2.0 * k + 1.0);
        sign = -sign;
    }
    return sum;
}
int main(int argc, char *argv[]) {
    /* Number of series terms; overridable via argv[1]. */
    int N = 2000;
    if (argc < 2) {
        fprintf(stderr, "\nUsage: %s <N: Integer_for_precision> \n", argv[0]);
        printf("Using default value 2000 for N\n\n");
    } else {
        /* strtol reports where parsing stopped; requiring the whole
         * argument to be consumed fixes the original check, whose
         * "== 0 || NULL" condition was equivalent to "== 0" alone and
         * both rejected legitimate zero input and accepted trailing
         * junk like "12abc".  (Also drops the stray NULL argument the
         * original passed to fprintf.)  This matches the parsing used
         * by the parallel version. */
        char *end;
        long val = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0') {
            fprintf(stderr, "Only integers are accepted as an argument.\n");
            return EXIT_FAILURE;
        }
        N = (int)val;
    }
    const long long begin_timestamp = timestamp();
    const double result_Leibniz_N = pi(N);
    const long long end_timestamp = timestamp();
    const long long time = end_timestamp - begin_timestamp;
    printf("Leibniz: %f \n", result_Leibniz_N);
    printf("Pi: %f\n", result_Leibniz_N * 4);
    printf("Time needed: %lld\n", time);
    return EXIT_SUCCESS;
}
(b) Parallelize the program using OpenMP and evaluate the speedup compared to the sequential version for an increasing number of threads. Go from one to the total number of cores available on the system.
#define _GNU_SOURCE
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static inline long long timestamp() {
    /* Current CLOCK_REALTIME reading converted to nanoseconds. */
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}
static double pi(int N) {
    /* OpenMP-parallel partial sum of the Leibniz series
     * sum_{k=0}^{N-1} (-1)^k / (2k+1); caller multiplies by 4.
     * The reduction gives every thread a private accumulator that is
     * combined at the end of the loop. */
    double acc = 0;
#pragma omp parallel for reduction(+ : acc)
    for (int k = 0; k < N; k++) {
        const double sign = (k % 2 == 0) ? 1.0 : -1.0; /* (-1)^k */
        acc += sign / ((2 * k) + 1);
    }
    return acc;
}
int main(int argc, char *argv[]) {
    /* Benchmark driver: parse the term count, time pi(), print results. */
    int N = 2000;
    if (argc >= 2) {
        char *tail;
        const long parsed = strtol(argv[1], &tail, 10);
        if (*tail != '\0') {
            fprintf(stderr, "Only integers are accepted as an argument.\n");
            return EXIT_FAILURE;
        }
        N = (int)parsed;
    } else {
        fprintf(stderr, "\nUsage: %s N: <Integer_for_precision> \n", argv[0]);
        printf("Using default value 2000 for N\n\n");
    }
    const long long t_begin = timestamp();
    const double leibniz_sum = pi(N);
    const long long t_end = timestamp();
    printf("Leibniz: %f \n", leibniz_sum);
    printf("Pi: %f\n", leibniz_sum * 4);
    printf("Time needed: %lld ns\n", t_end - t_begin);
    return EXIT_SUCCESS;
}
Note:
$lscpu
or$cat /proc/cpuinfo
I was interested in a comparison, so I plotted it; here are my results:
I used this script to calculate π using the Leibniz formula. The command I ran:
./pi_calculator 1000 5000000 1000000000
• 1000: Starting term. • 5000000: Step size. • 1000000000: End term.
This allowed me to analyze how the approximation improves with more terms in the series.
time_comparison_plot
speedup_plot
3. Parallel Prime Sieve
Write a program that calculates the number of primes between 1 and some upper limit N, e.g., N = 1,000,000. Use the following snippet as a prime test:
// Prime test snippet provided by the exercise: naive O(p) trial division.
// NOTE(review): for p < 2 the loop body never executes, so 0, 1, and
// negative inputs are all reported as prime — corrected in the solution
// further below.
bool isprime(int p)
{
for (int d = 2; d < p; ++d)
if (p % d == 0)
return false;
return true;
}
Experiment with OpenMP’s different loop scheduling options and evaluate their impact on the performance. You can either use the
schedule
clause or the OpenMP environment variableOMP_SCHEDULE
.
There’s an error: 1 is returned as prime, but that’s incorrect. A prime has exactly two divisors: 1 and itself. Since 1 has only one divisor, it is not prime (the same loop also misreports 0 and negative numbers as prime). I changed this in my solution below.
#define _GNU_SOURCE
#include <omp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NMAX_DEFAULT 1000000
static inline long long timestamp() {
    /* Returns CLOCK_REALTIME as a single nanosecond count. */
    struct timespec tick;
    clock_gettime(CLOCK_REALTIME, &tick);
    return tick.tv_sec * 1000000000LL + tick.tv_nsec;
}
static bool isprime(int p) {
    /* Intentionally naive O(p) trial division: the per-call cost grows
     * with p, which is what makes the scheduling experiments in this
     * exercise meaningful.  Values below 2 (0, 1, negatives) are not
     * prime — this fixes the skeleton's bug. */
    if (p < 2) {
        return false;
    }
    for (int divisor = 2; divisor < p; ++divisor) {
        if (p % divisor == 0) {
            return false;
        }
    }
    return true;
}
static int num_primes(int N) {
    /* Count the primes in [0, N) with the shared isprime() test.  The
     * reduction clause gives each thread a private counter that OpenMP
     * sums when the loop ends. */
    int count = 0;
#pragma omp parallel for reduction(+ : count)
    for (int candidate = 0; candidate < N; candidate++) {
        count += isprime(candidate) ? 1 : 0;
    }
    return count;
}
int main(int argc, char *argv[]) {
    int n_max = NMAX_DEFAULT;
    int primes;
    long long ts_start;
    long long duration;
    /* Parse the optional limit BEFORE printing it.  The original printed
     * the default limit first and only afterwards overwrote n_max from
     * argv, so the reported limit was wrong whenever one was supplied. */
    if (argc >= 2) {
        n_max = atoi(argv[1]);
    }
    printf("Prime Sieve\n");
    printf(" limit: %d\n", n_max);
    ts_start = timestamp();
    primes = num_primes(n_max);
    duration = timestamp() - ts_start;
    printf(" primes: %d time: %0.4f ms\n", primes, 1.0e-6 * duration);
    return EXIT_SUCCESS;
}
Output
╰─ ./sieve 1000000
Prime Sieve
limit: 1000000
primes: 78498 time: 16284.0440 ms
Other Solutions and Benchmark
#define _GNU_SOURCE
#include <omp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NMAX_DEFAULT 1000000
static inline long long timestamp() {
    /* Nanosecond wall-clock reading used for the benchmark timings. */
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}
static bool isprime(int p) {
    /* Deliberately slow O(p) trial division (cost grows with p) so the
     * scheduling benchmarks below have uneven per-iteration work.
     * 0, 1 and negatives are rejected up front. */
    if (p < 2) {
        return false;
    }
    int d = 2;
    while (d < p) {
        if (p % d == 0) {
            return false;
        }
        ++d;
    }
    return true;
}
// Serial implementation
static int num_primes_serial(int N) {
int prime_counter = 0;
for (int i = 0; i < N; i++) {
if (isprime(i)) {
prime_counter++;
}
}
return prime_counter;
}
// Parallel implementation with OpenMP
static int num_primes_parallel(int N, const char *schedule_type, int chunk_size) {
int prime_counter = 0;
if (schedule_type == NULL || chunk_size < 1) {
printf("Invalid scheduling parameters\n");
return -1;
}
#pragma omp parallel for schedule(runtime) reduction(+ : prime_counter)
for (int i = 0; i < N; i++) {
if (isprime(i)) {
prime_counter++;
}
}
return prime_counter;
}
/* Run one parallel benchmark: install the requested runtime schedule,
 * time num_primes_parallel(), and print a labelled result line. */
static void run_schedule_benchmark(const char *label, omp_sched_t sched,
                                   int chunk, int n_max) {
    omp_set_schedule(sched, chunk);
    long long ts_start = timestamp();
    int primes = num_primes_parallel(n_max, label, chunk);
    long long duration = timestamp() - ts_start;
    printf(" [%s] primes: %d time: %0.4f ms\n", label, primes,
           1.0e-6 * duration);
}

int main(int argc, char *argv[]) {
    int n_max = NMAX_DEFAULT;
    int primes;
    long long ts_start;
    long long duration;
    if (argc >= 2) {
        n_max = atoi(argv[1]);
    }
    printf("Prime Sieve with Benchmarking\n");
    printf(" limit: %d\n", n_max);
    /* Serial baseline. */
    ts_start = timestamp();
    primes = num_primes_serial(n_max);
    duration = timestamp() - ts_start;
    printf(" [Serial] primes: %d time: %0.4f ms\n", primes, 1.0e-6 * duration);
    /* One timed run per OpenMP scheduling policy (chunk size 100 each).
     * The three copy-pasted set-schedule/time/print sections of the
     * original are factored into run_schedule_benchmark(). */
    run_schedule_benchmark("Static", omp_sched_static, 100, n_max);
    run_schedule_benchmark("Dynamic", omp_sched_dynamic, 100, n_max);
    run_schedule_benchmark("Guided", omp_sched_guided, 100, n_max);
    return EXIT_SUCCESS;
}
╰─ make benchmark
cc -Xpreprocessor -fopenmp prime_sieve_scheduling.c -o sieve-benchmark -L/opt/homebrew/opt/libomp/lib -lomp -I/opt/homebrew/opt/libomp/include
Running benchmark with N=1000000…
./sieve-benchmark 1000000
Prime Sieve with Benchmarking
limit: 1000000
[Serial] primes: 78498 time: 64279.6030 ms
[Static] primes: 78498 time: 11042.1070 ms
[Dynamic] primes: 78498 time: 11012.4070 ms
[Guided] primes: 78498 time: 11182.5510 ms
Formula for Speedup

The speedup is calculated as:

$$S_p = \frac{T_{\text{serial}}}{T_{\text{parallel}}}$$

Explanation

This formula measures how many times faster a parallel execution ($T_{\text{parallel}}$) is compared to a serial execution ($T_{\text{serial}}$). A higher $S_p$ indicates better parallel performance.
4. Generating Partial Sums in Parallel
For a given array `A[0…N-1]`, a second array `B[0…N-1]` with the prefix sum of `A` should be generated. The prefix sum for the $i$-th element of `B` is simply defined as:

$$B_i = \sum_{j=0}^{i} A_j$$

where $A_j$ is the $j$-th element of `A`.

Parallelize the generation of the prefix sums (calculation of the elements of `B`) in the following program with OpenMP and test the correctness of your implementation.
#include <omp.h>
#include <stdio.h>
#define N 1000
// Exercise skeleton: sequential inclusive prefix sum of A into B.
// The loop marked "Parallelize:" carries a dependency (B[i] needs
// B[i-1]) — resolving that dependency is the point of the exercise.
int main(int argc, char* argv[])
{
int i;
int A[N];
int B[N];
// Initialize the input: A[i] = i.
for (i = 0; i < N; i++) {
A[i] = i;
}
B[0] = A[0];
// Parallelize:
for (i = 1; i < N; i++) {
B[i] = B[i - 1] + A[i];
}
// Print the resulting prefix sums.
for (i = 0; i < N; i++) {
printf("%d ", B[i]);
}
}
Solution
#include <omp.h>
#include <stdio.h>
#define N 1000
int main(int argc, char *argv[]) {
    int i;
    int A[N];
    int B[N];
    /* Initialize the input: A[i] = i. */
    for (i = 0; i < N; i++) {
        A[i] = i;
    }
    /* Parallel inclusive prefix sum.
     * BUG in the original: "#pragma omp parallel" (without "for") made
     * every thread execute the entire loop concurrently — a data race
     * on B and the shared index i — and the loop-carried dependency
     * (B[k] needs B[k-1]) means a plain "parallel for" would be wrong
     * too.  OpenMP 5.0's inscan reduction expresses the scan dependency
     * correctly; when compiled without OpenMP the pragmas are ignored
     * and the loop degrades to the correct sequential prefix sum. */
    int sum = 0;
#pragma omp parallel for reduction(inscan, + : sum)
    for (int k = 0; k < N; k++) {
        sum += A[k];
#pragma omp scan inclusive(sum)
        B[k] = sum;
    }
    /* Print the resulting prefix sums (same output as the skeleton). */
    for (i = 0; i < N; i++) {
        printf("%d ", B[i]);
    }
}