#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define NRA 4                          /* number of rows in matrix A */
#define NCA 4                          /* number of columns in matrix A = rows of B*/
#define CHUNK 2                     /*chunk size */

int main (int argc, char *argv[])
{
    int tid,i, j;
    int  a[NRA][NCA],                /* matrix A to be multiplied */
    b[NCA],                                /* matrix B to be multiplied */
    c[NRA],                                /* result matrix C */
    x[NRA][NCA/CHUNK];          /* Individual Thread results */
    for (i=0; i<NRA; i++) {
        for (j=0; j<NCA; j++) {
            a[i][j]=  i+j;
            printf(" a[%d,%d]=%i ",i,j,a[i][j]);
        }
        printf("\n");
    }
    for (i=0; i<NCA; i++) {
        b[i]= i;   
        printf("b[%i]=%i  ",i, b[i]);
    }
    printf("\nChecking\n");
    /*Verify result*/
    for (i=0; i<NRA; i++) {
        c[i]=0;
        for (j=0; j<NCA; j++) {
            c[i] =c[i]+a[i][j]*b[j];
        }
        printf("c[%i]= %i  ",i,c[i]);
    }
    printf("\n\nEntering Parallel Region\n");
#pragma omp parallel  num_threads(2) shared(a,b,c,x) private(tid,i,j)
    {
        tid = omp_get_thread_num();
        printf("Thread %d starting matrix multiplication\n",tid);
        for (i=0; i<NRA; i++) {
            x[i][tid]=0;
#pragma omp for schedule(static,CHUNK) /*nowait*/
            for (j=0; j<4; j++) {
                x[i][tid] =x[i][tid]+a[i][j]*b[j];
            }
            printf("Thread %i x[%i][%i] = %i\n",tid,i,tid, x[i][tid]);
        }
    }
    printf("\nLeft Parallel Region\n");

    for (i=0; i<NRA; i++) {
        c[i]=0;
        for (j=0;j<CHUNK;j++) {
            c[i]=c[i]+x[i][j];
        }
        printf("c[%i]=%i  ",i,c[i]);
    }
    printf("\n");
    return 0;
}