#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <mpi.h>

// This was so ugly man 
void reduce_tree(int *sendbuf, int *recvbuf, int count, int root, MPI_Comm comm) {
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    int *local = (int*)malloc(count * sizeof(int));
    memcpy(local, sendbuf, count * sizeof(int));

    int left = 2*rank + 1;
    int right = 2*rank + 2;
    int parent = (rank == root) ? -1 : (rank - 1)/2;
    MPI_Status status;

    if (left < size) {
        int *tmp = (int*)malloc(count * sizeof(int));
        MPI_Recv(tmp, count, MPI_INT, left, 0, comm, &status);
        for(int i=0;i<count;i++) local[i]+=tmp[i];
        free(tmp);
    }
    if (right < size) {
        int *tmp = (int*)malloc(count * sizeof(int));
        MPI_Recv(tmp, count, MPI_INT, right, 0, comm, &status);
        for(int i=0;i<count;i++) local[i]+=tmp[i];
        free(tmp);
    }
    if(rank != root)
        MPI_Send(local, count, MPI_INT, parent, 0, comm);
    else
        memcpy(recvbuf, local, count*sizeof(int));

    free(local);
}


void sequential_reduce(int *all_data, int num_procs, int count, int *result) {
    for(int i=0;i<count;i++) result[i]=0;
    for(int p=0;p<num_procs;p++)
        for(int i=0;i<count;i++)
            result[i] += all_data[p*count + i];
}

int main(int argc, char **argv) {
    MPI_Init(&argc,&argv);

    int rank,size;
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    if(argc!=2){
        if(rank==0) printf("Usage: %s <array_size>\n",argv[0]);
        MPI_Finalize();
        return 0;
    }

    int N = atoi(argv[1]);
    int *sendbuf = (int*)malloc(N*sizeof(int));
    srand(rank+1);
    for(int i=0;i<N;i++) sendbuf[i]=rand()%10;

    int *recvbuf = (rank==0)?malloc(N*sizeof(int)):NULL;

    MPI_Barrier(MPI_COMM_WORLD);
    double t1 = MPI_Wtime();
    reduce_tree(sendbuf,recvbuf,N,0,MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    double t2 = MPI_Wtime();
    double parallel_time = t2-t1;

    
    int *all_data = NULL;
    if(rank==0) all_data = (int*)malloc(size*N*sizeof(int));
    MPI_Gather(sendbuf,N,MPI_INT,all_data,N,MPI_INT,0,MPI_COMM_WORLD);

    double t3 = MPI_Wtime();
    int *seq_result=NULL;
    if(rank==0){
        seq_result = (int*)malloc(N*sizeof(int));
        sequential_reduce(all_data,size,N,seq_result);
    }
    double t4 = MPI_Wtime();
    double sequential_time = t4-t3;

    if(rank==0){
        printf("Processes: %d\n",size);
        printf("Array size: %d\n",N);
        printf("Parallel reduce time: %f s\n",parallel_time);
        printf("Sequential reduce time: %f s\n",sequential_time);
        // correctness
        int correct=1;
        for(int i=0;i<N;i++) if(seq_result[i]!=recvbuf[i]) {correct=0; break;}
        printf("Correct: %s\n",correct?"True":"False");
        free(all_data);
        free(seq_result);
        free(recvbuf);
    }

    free(sendbuf);
    MPI_Finalize();
    return 0;
}

