1. Multithreaded Hello World

Write two versions of a multithreaded “Hello World” program in which every thread prints a message like:

Hello, I'm thread number 3 of 5 threads

(a) Use Pthreads as described in the lecture. The number of threads should be an argument to the program (e.g., ./pthread_hello 4 should create 4 threads). A skeleton and a Makefile are available for download.
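
Without the provided Makefile, the skeleton can also be built and run directly with something like the following (assuming gcc and that the source file is called pthread_hello.c; both names are just examples):

gcc -Wall -O2 -pthread pthread_hello.c -o pthread_hello
./pthread_hello 4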

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
 
int nthreads;
 
void *perform_work(void *args) {
  int thread_index = *((int *)args);
  printf("Hello, I'm thread number %d of %d threads\n", thread_index, nthreads);
  return NULL;
}
 
int main(int argc, char **argv) {
  if (argc < 2) {
    return EXIT_FAILURE;
  } else {
    nthreads = atoi(argv[1]);
  }
 
  // allocate nthreads * sizeof(pthread_t), ….
  pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * nthreads);
  int *thread_indices = (int *)malloc(sizeof(int) * nthreads);
 
  // create threads…
  for (int i = 0; i < nthreads; i++) {
    thread_indices[i] = i;
    pthread_create(&threads[i], NULL, perform_work, &thread_indices[i]);
  }
  // join threads …
  for (int i = 0; i < nthreads; i++) {
    pthread_join(threads[i], NULL);
  }
  free(threads);
  free(thread_indices);
 
  return EXIT_SUCCESS;
}
  1. Parse the argument (argv[1]) to determine the number of threads (nthreads).
  2. Allocate memory for the pthread_t array and thread_indices.
  3. Create threads:
    • Each thread is assigned a unique index via thread_indices.
    • Why thread_indices is important and cannot be NULL:
      • The perform_work function requires a unique identifier (thread_index) for each thread.
      • pthread_create passes a pointer (&thread_indices[i]) to perform_work, which allows each thread to know its specific index.
      • If NULL is passed instead, perform_work receives a NULL pointer, and dereferencing it (e.g., *((int *)args)) crashes with a segmentation fault.
      • The array thread_indices ensures that each thread operates on its own dedicated index, avoiding undefined behavior and shared-data issues (a deliberately broken variant is sketched after this list).
  4. The main program waits with pthread_join until all threads are finished.
  5. The program terminates cleanly after joining all threads and freeing the allocated memory.
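
For contrast, here is a deliberately broken variant (my own illustration, not part of the skeleton) that passes the address of the shared loop variable instead of a dedicated slot in thread_indices:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

int nthreads;

void *perform_work(void *args) {
  int thread_index = *((int *)args); // data race: reads the loop variable that main keeps changing
  printf("Hello, I'm thread number %d of %d threads\n", thread_index, nthreads);
  return NULL;
}

int main(int argc, char **argv) {
  if (argc < 2)
    return EXIT_FAILURE;
  nthreads = atoi(argv[1]);

  pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * nthreads);

  for (int i = 0; i < nthreads; i++) {
    // WRONG: every thread receives the address of the same variable i;
    // indices may repeat, be skipped, or even be read after i's lifetime has ended.
    pthread_create(&threads[i], NULL, perform_work, &i);
  }
  for (int i = 0; i < nthreads; i++)
    pthread_join(threads[i], NULL);

  free(threads);
  return EXIT_SUCCESS;
}

Because main keeps modifying i while the threads dereference the pointer, the printed indices are unpredictable; the per-thread thread_indices array avoids exactly this race.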

(b) Implement an OpenMP version. You may use any compiler that supports OpenMP. On the SuperMUC-NG, both icc and gcc support OpenMP.

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
 
int nthreads;
 
void *perform_work(void *args) {
  int thread_index = *((int *)args);
  printf("Hello, I'm thread number %d of %d threads\n", thread_index, nthreads);
  return NULL;
}
 
int main(int argc, char **argv) {
  if (argc < 2) {
    return EXIT_FAILURE;
  } else {
    nthreads = atoi(argv[1]);
  }
 
  omp_set_num_threads(nthreads); // actually request nthreads threads for the parallel region

  // create threads: the loop iterations are distributed over the team;
  // each iteration prints the id of the thread that executes it
#pragma omp parallel for
  for (int i = 0; i < nthreads; i++) {
    int thread_num = omp_get_thread_num();
    perform_work(&thread_num);
  }
 
  return EXIT_SUCCESS;
}
  1. Parse the argument (argv[1]) to determine the number of threads (nthreads).
  2. Request nthreads threads with omp_set_num_threads and use OpenMP’s #pragma omp parallel for to parallelize the loop (an alternative using a plain parallel region is sketched after this list).
  3. Each thread executes the perform_work function with its unique thread index from omp_get_thread_num.
  4. The program completes once all threads have finished.
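
An alternative formulation (a sketch of my own, not required by the task) opens a parallel region directly instead of parallelizing a loop; every thread then prints exactly one message, and omp_get_num_threads() reports the actual team size:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
  if (argc < 2)
    return EXIT_FAILURE;
  int nthreads = atoi(argv[1]);

  omp_set_num_threads(nthreads);

#pragma omp parallel
  {
    // Every thread of the team executes this block exactly once.
    printf("Hello, I'm thread number %d of %d threads\n",
           omp_get_thread_num(), omp_get_num_threads());
  }

  return EXIT_SUCCESS;
}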

(c) Modify both versions so that the messages are printed ordered by thread number.

Pthreads Ordered

The simplest way to get the messages in order with Pthreads is to join each thread immediately after creating it; this guarantees the order but also serializes the threads completely (a non-serializing alternative is sketched after the listing).

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
 
int nthreads;
 
void *perform_work(void *args) {
  int thread_index = *((int *)args);
  printf("Hello, I'm thread number %d of %d threads\n", thread_index, nthreads);
  return NULL;
}
 
int main(int argc, char **argv) {
  if (argc < 2) {
    return EXIT_FAILURE;
  } else {
    nthreads = atoi(argv[1]);
  }
 
  pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * nthreads);
  int *thread_indices = (int *)malloc(sizeof(int) * nthreads);
 
  for (int i = 0; i < nthreads; i++) {
    thread_indices[i] = i;
    pthread_create(&threads[i], NULL, perform_work, &thread_indices[i]);
    pthread_join(threads[i], NULL);
  }
  free(threads);
  free(thread_indices);
  return EXIT_SUCCESS;
}
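
Joining each thread right after creating it produces the required order, but it also means that only one thread ever runs at a time. A non-serializing sketch (my own illustration): all threads are created up front, and a mutex/condition-variable pair enforces only the printing order.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

int nthreads;
int turn = 0; // index of the thread that is allowed to print next
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

void *perform_work(void *args) {
  int thread_index = *((int *)args);

  pthread_mutex_lock(&lock);
  while (turn != thread_index) // wait until it is this thread's turn
    pthread_cond_wait(&cond, &lock);
  printf("Hello, I'm thread number %d of %d threads\n", thread_index, nthreads);
  turn++;                      // pass the turn to the next thread index
  pthread_cond_broadcast(&cond);
  pthread_mutex_unlock(&lock);
  return NULL;
}

int main(int argc, char **argv) {
  if (argc < 2)
    return EXIT_FAILURE;
  nthreads = atoi(argv[1]);

  pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * nthreads);
  int *thread_indices = (int *)malloc(sizeof(int) * nthreads);

  for (int i = 0; i < nthreads; i++) {
    thread_indices[i] = i;
    pthread_create(&threads[i], NULL, perform_work, &thread_indices[i]);
  }
  for (int i = 0; i < nthreads; i++)
    pthread_join(threads[i], NULL);

  free(threads);
  free(thread_indices);
  return EXIT_SUCCESS;
}

Only the printing is forced into index order; thread creation and any other work remain concurrent.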

OpenMP Ordered

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
 
int nthreads;
 
void perform_work(int thread_index) {
  printf("Hello, I'm thread number %d of %d threads\n", thread_index, nthreads);
}
 
int main(int argc, char **argv) {
  if (argc < 2) {
    return EXIT_FAILURE;
  } else {
    nthreads = atoi(argv[1]);
  }
 
  omp_set_num_threads(nthreads);
 
// The loop itself still runs in parallel; the ordered region only serializes
// the printing so that it happens in iteration (= thread index) order
#pragma omp parallel for ordered
  for (int i = 0; i < nthreads; i++) {
#pragma omp ordered
    perform_work(i);
  }
 
  return EXIT_SUCCESS;
}

2. Calculation of π as the Limit of a Series

The Leibniz formula is one of many methods to calculate π numerically:

$$\frac{\pi}{4} = \sum_{k=0}^{\infty} \frac{(-1)^k}{2k+1} = 1 - \frac{1}{3} + \frac{1}{5} - \frac{1}{7} + \cdots$$

(a) Write a sequential program that calculates π using the Leibniz formula. Approximate at least 8 decimal places.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
 
static inline long long timestamp() {
  struct timespec ts;
  long long timestamp;
  clock_gettime(CLOCK_REALTIME, &ts);
  timestamp = ts.tv_sec * 1000000000LL + ts.tv_nsec;
  return timestamp;
}
 
static double pi(int N) {
  double result = 0;
  for (int k = 0; k < N; k++) {
    double numerator = (k % 2 == 0) ? 1.0 : -1.0; // (-1)^k
    double denominator = (2 * k) + 1;
    result += (numerator / denominator);
  }
  return result;
}
 
int main(int argc, char *argv[]) {
  /** my implementation of the measurements **/
  int N = 2000;
 
  // Optional command-line argument for N (the number of series terms)
  if (argc < 2) {
    fprintf(stderr, "\nUsage: %s <N: number of series terms>\n", argv[0]);
    printf("Using default value 2000 for N\n\n");
  } else {
    char *end;
    long val = strtol(argv[1], &end, 10); // strtol converts the string; *end flags trailing garbage
    if (*end != '\0' || val <= 0) {
      fprintf(stderr, "Only positive integers are accepted as an argument.\n");
      return EXIT_FAILURE;
    } else {
      N = (int)val;
    }
  }
 
  const long long begin_timestamp = timestamp();
  const double result_Leibniz_N = pi(N);
  const long long end_timestamp = timestamp();
  const long long time = end_timestamp - begin_timestamp;
 
  printf("Leibniz: %f \n", result_Leibniz_N);
  printf("Pi: %f\n", result_Leibniz_N * 4);
  printf("Time needed: %lld\n", time);
 
  return EXIT_SUCCESS;
}
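
Since the series alternates, the truncation error is bounded by the first omitted term: |π − 4·S_N| ≤ 4/(2N+1). On the order of 10^8 terms are therefore needed for 8 correct decimal places, while the default N = 2000 only gives about 3. A small convergence check of my own (using M_PI from math.h as the reference):

#define _GNU_SOURCE
#include <math.h>
#include <stdio.h>

int main(void) {
  // Error of the Leibniz approximation for increasing N
  // (alternating-series bound: |pi - 4*S_N| <= 4/(2N+1)).
  for (long N = 1000; N <= 100000000L; N *= 10) {
    double s = 0.0;
    for (long k = 0; k < N; k++)
      s += ((k % 2 == 0) ? 1.0 : -1.0) / (2.0 * k + 1);
    printf("N = %9ld   pi ~ %.10f   |error| = %.2e\n", N, 4.0 * s, fabs(4.0 * s - M_PI));
  }
  return 0;
}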
 

(b) Parallelize the program using OpenMP and evaluate the speedup compared to the sequential version for an increasing number of threads. Go from one to the total number of cores available on the system.

#define _GNU_SOURCE
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
 
static inline long long timestamp() {
  struct timespec ts;
  long long timestamp;
  clock_gettime(CLOCK_REALTIME, &ts);
  timestamp = ts.tv_sec * 1000000000LL + ts.tv_nsec;
  return timestamp;
}
 
static double pi(int N) {
  /** my implementation of the Leibniz formula **/
  double result = 0;
 
#pragma omp parallel for reduction(+ : result)
  for (int k = 0; k < N; k++) {
    double numerator = (k % 2 == 0) ? 1.0 : -1.0; // (-1)^k
    double denominator = (2 * k) + 1;
    result += (numerator / denominator);
  }
 
  return result;
}
 
int main(int argc, char *argv[]) {
  /** my implementation of the measurements **/
  int N = 2000;
 
  // Optional command-line argument for N (the number of series terms)
  if (argc < 2) {
    fprintf(stderr, "\nUsage: %s <N: number of series terms>\n", argv[0]);
    printf("Using default value 2000 for N\n\n");
  } else {
    char *end;
    long val = strtol(argv[1], &end, 10);
    if (*end != '\0' || val <= 0) {
      fprintf(stderr, "Only positive integers are accepted as an argument.\n");
      return EXIT_FAILURE;
    } else {
      N = (int)val;
    }
  }
 
  const long long begin_timestamp = timestamp();
  const double result_Leibniz_N = pi(N);
  const long long end_timestamp = timestamp();
  const long long time = end_timestamp - begin_timestamp;
 
  printf("Leibniz: %f \n", result_Leibniz_N);
  printf("Pi: %f\n", result_Leibniz_N * 4);
  printf("Time needed: %lld ns\n", time);
 
  return EXIT_SUCCESS;
}
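
To evaluate the speedup from one thread up to the number of available cores, the thread count can be varied without recompiling via the OMP_NUM_THREADS environment variable (the binary name pi_omp is just an example):

OMP_NUM_THREADS=1 ./pi_omp 1000000000
OMP_NUM_THREADS=2 ./pi_omp 1000000000
OMP_NUM_THREADS=4 ./pi_omp 1000000000
OMP_NUM_THREADS=8 ./pi_omp 1000000000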
 

Note: the number of available cores can be determined with lscpu or cat /proc/cpuinfo.

I was interested in a comparison, so I plotted it; here are my results:

I used a separate driver program to sweep the number of series terms when calculating π with the Leibniz formula. The command I ran:

./pi_calculator 1000 5000000 1000000000

• 1000: starting term
• 5000000: step size
• 1000000000: end term

This allowed me to analyze how the approximation improves with more terms in the series.
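
The sweep driver itself is not shown above; a minimal sketch of what such a pi_calculator might look like (my reconstruction, assuming it reuses the parallel pi() from (b) and takes the three start/step/end arguments):

#define _GNU_SOURCE
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static inline long long timestamp(void) {
  struct timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);
  return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static double pi(long N) {
  double result = 0;
#pragma omp parallel for reduction(+ : result)
  for (long k = 0; k < N; k++)
    result += ((k % 2 == 0) ? 1.0 : -1.0) / (2.0 * k + 1);
  return result;
}

int main(int argc, char *argv[]) {
  if (argc < 4) {
    fprintf(stderr, "Usage: %s <start> <step> <end>\n", argv[0]);
    return EXIT_FAILURE;
  }
  long start = strtol(argv[1], NULL, 10);
  long step = strtol(argv[2], NULL, 10);
  long end = strtol(argv[3], NULL, 10);

  // One line per N: number of terms, approximation of pi, elapsed time in ns.
  for (long N = start; N <= end; N += step) {
    long long t0 = timestamp();
    double approx = 4.0 * pi(N);
    long long t1 = timestamp();
    printf("%ld %.10f %lld\n", N, approx, t1 - t0);
  }
  return EXIT_SUCCESS;
}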

[Figure: execution time comparison]

[Figure: speedup]


3. Parallel Prime Sieve

Write a program that calculates the number of primes between 1 and some upper limit N, e.g., N = 1000000. Use the following snippet as a prime test:

bool isprime(int p)
{
    for (int d = 2; d < p; ++d)
        if (p % d == 0)
            return false;
 
    return true;
}

Experiment with OpenMP’s different loop scheduling options and evaluate their impact on the performance. You can either use the schedule clause or the OpenMP environment variable OMP_SCHEDULE.
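
In practice the two options look like this (illustrative; prime_counter is the reduction variable from the code below). The schedule clause fixes the schedule at compile time:

#pragma omp parallel for schedule(dynamic, 100) reduction(+ : prime_counter)

With schedule(runtime) in the code instead, the schedule can be chosen per run through the environment, provided the program does not override it with omp_set_schedule (the benchmark version further below does override it):

OMP_SCHEDULE="dynamic,100" ./sieve 1000000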

There’s an error in the given snippet: it reports 0 and 1 as prime, but that’s incorrect. A prime has exactly two divisors, 1 and itself, so neither 0 nor 1 qualifies. I changed this in my solution below (the p < 2 check).

#define _GNU_SOURCE
 
#include <omp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
 
#define NMAX_DEFAULT 1000000
 
static inline long long timestamp() {
  struct timespec ts;
  long long timestamp;
  clock_gettime(CLOCK_REALTIME, &ts);
  timestamp = ts.tv_sec * 1000000000LL + ts.tv_nsec;
  return timestamp;
}
 
static bool isprime(int p) {
  if (p < 2)
    return false; // 0 and 1 are not prime (fixes the issue in the skeleton code)
  for (int d = 2; d < p; ++d)
    if (p % d == 0)
      return false;
 
  return true;
}
 
static int num_primes(int N) {
  int prime_counter = 0;
#pragma omp parallel for reduction(+ : prime_counter)
  for (int i = 0; i < N; i++) {
    if (isprime(i)) {
      prime_counter++;
    }
  }
  return prime_counter;
}
 
int main(int argc, char *argv[]) {
  int n_max = NMAX_DEFAULT;
  int primes;
  long long ts_start;
  long long duration;
 
  printf("Prime Sieve\n");
  printf("    limit:  %d\n", n_max);
 
  if (argc >= 2) {
    n_max = atoi(argv[1]);
  }
 
  ts_start = timestamp();
  primes = num_primes(n_max);
  duration = timestamp() - ts_start;
 
  printf("    primes: %d   time: %0.4f ms\n", primes, 1.0e-6 * duration);
 
  return EXIT_SUCCESS;
}
 

Output

╰─ ./sieve 1000000
Prime Sieve
    limit:  1000000
    primes: 78498   time: 16284.0440 ms

Other Solutions and Benchmark

#define _GNU_SOURCE
 
#include <omp.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
 
#define NMAX_DEFAULT 1000000
 
static inline long long timestamp() {
  struct timespec ts;
  long long timestamp;
  clock_gettime(CLOCK_REALTIME, &ts);
  timestamp = ts.tv_sec * 1000000000LL + ts.tv_nsec;
  return timestamp;
}
 
static bool isprime(int p) {
  if (p < 2)
    return false; // 1 is not a prime
  for (int d = 2; d < p; ++d)
    if (p % d == 0)
      return false;
 
  return true;
}
 
// Serial implementation
static int num_primes_serial(int N) {
  int prime_counter = 0;
  for (int i = 0; i < N; i++) {
    if (isprime(i)) {
      prime_counter++;
    }
  }
  return prime_counter;
}
 
// Parallel implementation with OpenMP. The actual schedule is chosen by the caller
// via omp_set_schedule() and picked up here through schedule(runtime); the
// schedule_type and chunk_size parameters are only sanity-checked and used for labeling.
static int num_primes_parallel(int N, const char *schedule_type, int chunk_size) {
  int prime_counter = 0;
  if (schedule_type == NULL || chunk_size < 1) {
    printf("Invalid scheduling parameters\n");
    return -1;
  }

#pragma omp parallel for schedule(runtime) reduction(+ : prime_counter)
  for (int i = 0; i < N; i++) {
    if (isprime(i)) {
      prime_counter++;
    }
  }
  return prime_counter;
}
 
int main(int argc, char *argv[]) {
  int n_max = NMAX_DEFAULT;
  int primes;
  long long ts_start;
  long long duration;
 
  if (argc >= 2) {
    n_max = atoi(argv[1]);
  }
 
  printf("Prime Sieve with Benchmarking\n");
  printf("    limit: %d\n", n_max);
 
  // Serial computation
  ts_start = timestamp();
  primes = num_primes_serial(n_max);
  duration = timestamp() - ts_start;
  printf("    [Serial]    primes: %d   time: %0.4f ms\n", primes, 1.0e-6 * duration);
 
  // Static scheduling
  omp_set_schedule(omp_sched_static, 100);
  ts_start = timestamp();
  primes = num_primes_parallel(n_max, "static", 100);
  duration = timestamp() - ts_start;
  printf("    [Static]    primes: %d   time: %0.4f ms\n", primes, 1.0e-6 * duration);
 
  // Dynamic scheduling
  omp_set_schedule(omp_sched_dynamic, 100);
  ts_start = timestamp();
  primes = num_primes_parallel(n_max, "dynamic", 100);
  duration = timestamp() - ts_start;
  printf("    [Dynamic]   primes: %d   time: %0.4f ms\n", primes, 1.0e-6 * duration);
 
  // Guided scheduling
  omp_set_schedule(omp_sched_guided, 100);
  ts_start = timestamp();
  primes = num_primes_parallel(n_max, "guided", 100);
  duration = timestamp() - ts_start;
  printf("    [Guided]    primes: %d   time: %0.4f ms\n", primes, 1.0e-6 * duration);
 
  return EXIT_SUCCESS;
}

Output

╰─ make benchmark
cc -Xpreprocessor -fopenmp prime_sieve_scheduling.c -o sieve-benchmark -L/opt/homebrew/opt/libomp/lib -lomp -I/opt/homebrew/opt/libomp/include
 
Running benchmark with N=1000000
./sieve-benchmark 1000000
 
Prime Sieve with Benchmarking
    limit: 1000000
    [Serial]    primes: 78498   time: 64279.6030 ms
    [Static]    primes: 78498   time: 11042.1070 ms
    [Dynamic]   primes: 78498   time: 11012.4070 ms
    [Guided]    primes: 78498   time: 11182.5510 ms

Formula for Speedup

The speedup is calculated as:

$$S = \frac{T_{\text{serial}}}{T_{\text{parallel}}}$$

Explanation

This formula measures how many times faster a parallel execution (time $T_{\text{parallel}}$) is compared to a serial execution (time $T_{\text{serial}}$). A higher $S$ indicates better parallel performance.
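
Plugging in the benchmark results from above, e.g. serial vs. the static schedule:

$$S = \frac{64279.6\ \text{ms}}{11042.1\ \text{ms}} \approx 5.8$$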


4. Generating Partial Sums in Parallel

For a given array A[0…N-1], a second array B[0…N-1] with the prefix sum of A should be generated. The prefix sum for the i-th element of B is defined as:

$$B_i = \sum_{j=0}^{i} A_j$$

where $A_j$ is the $j$-th element of $A$.

Parallelize the generation of the prefix sums (calculation of the elements of B) in the following program with OpenMP and test the correctness of your implementation.

#include <omp.h>
#include <stdio.h>
 
#define N 1000
int main(int argc, char* argv[])
{
    int i;
    int A[N];
    int B[N];
    for (i = 0; i < N; i++) {
        A[i] = i;
    }
    B[0] = A[0];
    // Parallelize:
    for (i = 1; i < N; i++) {
        B[i] = B[i - 1] + A[i];
    }
    for (i = 0; i < N; i++) {
        printf("%d ", B[i]);
    }
}

Solution

As written in the skeleton, a plain #pragma omp parallel makes every thread execute the entire loop on shared data, and even #pragma omp parallel for would be wrong here, because B[i] depends on B[i - 1]: the iterations carry a dependency, so running them concurrently produces data races and wrong results. One way to parallelize the prefix sum is the scan directive from OpenMP 5.0 (requires a recent compiler, e.g. GCC 10+ or a recent Clang):

#include <omp.h>
#include <stdio.h>

#define N 1000
int main(int argc, char *argv[]) {
  int i;
  int A[N];
  int B[N];
  int sum = 0;
  for (i = 0; i < N; i++) {
    A[i] = i;
  }
  // Parallelize: inclusive prefix sum with the OpenMP 5.0 scan directive.
  // 'sum' is the inscan reduction variable; the scan directive separates the
  // accumulation from the point where the running prefix value is used.
#pragma omp parallel for reduction(inscan, + : sum)
  for (i = 0; i < N; i++) {
    sum += A[i];
#pragma omp scan inclusive(sum)
    B[i] = sum;
  }
  for (i = 0; i < N; i++) {
    printf("%d ", B[i]);
  }
  printf("\n");
}
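
The scan directive requires a fairly new compiler. As a more portable alternative, here is a sketch (my own, not part of the exercise material) of the classic block-wise two-phase prefix sum: each thread scans its own block, the block totals are combined, and each block is then shifted by the total of all preceding blocks. It also compares the result against the sequential prefix sum, which covers the "test the correctness" part of the task.

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N 1000

int main(void) {
  int A[N], B[N], B_ref[N];

  for (int i = 0; i < N; i++)
    A[i] = i;

  // Reference result for the correctness test: sequential prefix sum.
  B_ref[0] = A[0];
  for (int i = 1; i < N; i++)
    B_ref[i] = B_ref[i - 1] + A[i];

  // offsets[t + 1] will hold the total of thread t's block; calloc zeroes offsets[0].
  int max_threads = omp_get_max_threads();
  int *offsets = (int *)calloc(max_threads + 1, sizeof(int));

#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int nt = omp_get_num_threads();
    int chunk = (N + nt - 1) / nt; // block size per thread (last block may be shorter)
    int lo = tid * chunk;
    int hi = (lo + chunk < N) ? lo + chunk : N;

    // Phase 1: each thread computes the prefix sums of its own block.
    int sum = 0;
    for (int i = lo; i < hi; i++) {
      sum += A[i];
      B[i] = sum;
    }
    offsets[tid + 1] = sum;

#pragma omp barrier
#pragma omp single
    {
      // Exclusive prefix sum over the block totals (only nt values, done by one thread).
      for (int t = 1; t <= nt; t++)
        offsets[t] += offsets[t - 1];
    } // implicit barrier at the end of single

    // Phase 2: add the total of all preceding blocks to every element of this block.
    for (int i = lo; i < hi; i++)
      B[i] += offsets[tid];
  }
  free(offsets);

  // Correctness test against the sequential result.
  int errors = 0;
  for (int i = 0; i < N; i++)
    if (B[i] != B_ref[i])
      errors++;
  if (errors == 0)
    printf("Prefix sums correct.\n");
  else
    printf("Mismatch in %d elements!\n", errors);
  return errors == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}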