/* RandomAccess.c */

/* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */

/*
 * This code has been contributed by the DARPA HPCS program.  Contact
 * David Koester <dkoester@mitre.org> or Bob Lucas <rflucas@isi.edu>
 * if you have questions.
 *
 *
 * GUPS (Giga UPdates per Second) is a measurement that profiles the memory
 * architecture of a system and is a measure of performance similar to MFLOPS.
 * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the
 * GUPS capability of a system, much like the LINPACK benchmark is intended to
 * exercise the MFLOPS capability of a computer.  In each case, we would
 * expect these benchmarks to achieve close to the "peak" capability of the
 * memory system. The extent of the similarities between RandomAccess and
 * LINPACK is limited to both benchmarks attempting to calculate a peak system
 * capability.
 *
 * GUPS is calculated by identifying the number of memory locations that can be
 * randomly updated in one second, divided by 1 billion (1e9). The term "randomly"
 * means that there is little relationship between one address to be updated and
 * the next, except that they occur in the space of one half the total system
 * memory.  An update is a read-modify-write operation on a table of 64-bit words.
 * An address is generated, the value at that address read from memory, modified
 * by an integer operation (add, and, or, xor) with a literal value, and that
 * new value is written back to memory.
 *
 * We are interested in knowing the GUPS performance of both entire systems and
 * system subcomponents --- e.g., the GUPS rating of a distributed memory
 * multiprocessor, the GUPS rating of an SMP node, and the GUPS rating of a
 * single processor.  While there is typically a scaling of FLOPS with processor
 * count, a similar phenomenon may not always occur for GUPS.
 *
 * Select the memory size to be the power of two such that 2^n <= 1/2 of the
 * total memory.  Each CPU operates on its own address stream, and the single
 * table may be distributed among nodes. The distribution of memory to nodes
 * is left to the implementer.  A uniform data distribution may help balance
 * the workload, while non-uniform data distributions may simplify the
 * calculations that identify processor location by eliminating the requirement
 * for integer divides. A small (less than 1%) percentage of missed updates
 * are permitted.
 *
 * When implementing a benchmark that measures GUPS on a distributed memory
 * multiprocessor system, it may be required to define constraints as to how
 * far in the random address stream each node is permitted to "look ahead".
 * Likewise, it may be required to define a constraint as to the number of
 * update messages that can be stored before processing to permit multi-level
 * parallelism for those systems that support such a paradigm.  The limits on
 * "look ahead" and "stored updates" are being implemented to assure that the
 * benchmark meets the intent to profile memory architecture and not induce
 * significant artificial data locality. For the purpose of measuring GUPS,
 * we will stipulate that each thread is permitted to look ahead no more than
 * 1024 random address stream samples with the same number of update messages
 * stored before processing.
 *
 * The supplied MPI-1 code generates the input stream {A} on all processors
 * and the global table has been distributed as uniformly as possible to
 * balance the workload and minimize any Amdahl fraction.  This code does not
 * exploit "look-ahead".  Addresses are sent to the appropriate processor
 * where the table entry resides as soon as each address is calculated.
 * Updates are performed as addresses are received.  Each message is limited
 * to a single 64 bit long integer containing element ai from {A}.
 * Local offsets for T[ ] are extracted by the destination processor.
 *
 * If the number of processors is equal to a power of two, then the global
 * table can be distributed equally over the processors.  In addition, the
 * processor number can be determined from that portion of the input stream
 * that identifies the address into the global table by masking off log2(p)
 * bits in the address.
 *
 * If the number of processors is not equal to a power of two, then the global
 * table cannot be equally distributed between processors.  In the MPI-1
 * implementation provided, there has been an attempt to minimize the differences
 * in workloads and the largest difference in elements of T[ ] is one.  The
 * number of values in the input stream generated by each processor will be
 * related to the number of global table entries on each processor.
 *
 * The MPI-1 version of RandomAccess treats the potential instance where the
 * number of processors is a power of two as a special case, because of the
 * significant simplifications possible because processor location and local
 * offset can be determined by applying masks to the input stream values.
 * The non power of two case uses an integer division to determine the processor
 * location.  The integer division will be more costly in terms of machine
 * cycles to perform than the bit masking operations.
 *
 * For additional information on the GUPS metric, the HPCchallenge RandomAccess
 * Benchmark, and the rules to run RandomAccess or modify it to optimize
 * performance -- see http://icl.cs.utk.edu/hpcc/
 *
 */

/* Jan 2005
 *
 * This code has been modified to allow local bucket sorting of updates.
 * The total maximum number of updates in the local buckets of a process
 * is currently defined in "RandomAccess.h" as MAX_TOTAL_PENDING_UPDATES.
 * When the total maximum number of updates is reached, the process selects
 * the bucket (or destination process) with the largest number of
 * updates and sends out all the updates in that bucket. See buckets.c
 * for details about the buckets' implementation.
 *
 * This code also supports posting multiple MPI receive descriptors (based
 * on a contribution by David Addison).
 *
 * In addition, this implementation provides an option for limiting
 * the execution time of the benchmark to a specified time bound
 * (see time_bound.c). The time bound is currently defined in
 * time_bound.h, but it should be a benchmark parameter. By default
 * the benchmark will execute the recommended number of updates,
 * that is, four times the global table size.
 */

/*
 * OpenSHMEM version:
 *
 * Copyright (c) 2011 - 2015
 *   University of Houston System and UT-Battelle, LLC.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * o Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * o Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * o Neither the name of the University of Houston System,
 *   UT-Battelle, LLC. nor the names of its contributors may be used to
 *   endorse or promote products derived from this software without specific
 *   prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <hpcc.h>

#include "RandomAccess.h"

#include <stdio.h>

#include <shmem.h>

/* Pointer to this PE's local portion of the main update table.  Only the
 * pointer is defined here; the storage itself is allocated and released
 * inside HPCC_SHMEMRandomAccess(). */
u64Int *HPCC_Table;

/* Statically sized send and receive buffers of LOCAL_BUFFER_SIZE and
 * MAX_RECV*LOCAL_BUFFER_SIZE 64-bit words.  They are not referenced in this
 * part of the file -- presumably they are used by the update routines
 * (TODO confirm against the files that reference them). */
u64Int LocalSendBuffer[LOCAL_BUFFER_SIZE];
u64Int LocalRecvBuffer[MAX_RECV*LOCAL_BUFFER_SIZE];

/*
 * Driver for the OpenSHMEM RandomAccess (GUPS) benchmark.
 *
 * Sizes the global table as a power of two, allocates this PE's slice,
 * runs the timed update phase (Power2NodesRandomAccessUpdate), distributes
 * the GUP/s figure from PE 0 to all PEs, then runs the timed verification
 * phase (HPCC_Power2NodesSHMEMRandomAccessCheck) and sums the per-PE error
 * counts.
 *
 * Only power-of-two PE counts are supported; for any other count PE 0
 * prints a message and the routine returns without running the benchmark.
 *
 * params  in/out benchmark parameter block.  Written here:
 *           HPLMaxProcMem (overwritten with a fixed value),
 *           SHMEMGUPs (all PEs; -1 if the benchmark did not run),
 *           SHMEMRandomAccess_N (all PEs),
 *           SHMEMRandomAccess_ExeUpdates, SHMEMRandomAccess_time,
 *           SHMEMRandomAccess_CheckTime, SHMEMRandomAccess_Errors,
 *           SHMEMRandomAccess_ErrorsFraction, SHMEMRandomAccess_Algorithm
 *           and (on verification failure) Failure -- PE 0 only.
 *
 * Returns 0 in every case, including the early-exit paths.
 *
 * Must be called by all PEs: it uses collective SHMEM operations
 * (shmem_malloc/shmem_free, barriers, broadcast, reductions).
 */
int
HPCC_SHMEMRandomAccess(HPCC_Params *params) {
  s64Int i;

  int NumProcs, logNumProcs, MyProc;
  u64Int GlobalStartMyProc;
  int Remainder;            /* Number of processors with (LocalTableSize + 1) entries */
  u64Int Top;               /* Number of table entries in top of Table */
  s64Int LocalTableSize;    /* Local table width */
  u64Int MinLocalTableSize; /* Integer ratio TableSize/NumProcs */
  u64Int logTableSize, TableSize;

  double RealTime;              /* Real time to update table */

  double TotalMem;
  int PowerofTwo;

  u64Int NumUpdates_Default; /* Number of updates to table (suggested: 4x number of table entries) */
  u64Int NumUpdates;         /* actual number of updates to table */
  s64Int ProcNumUpdates;     /* number of updates per processor */
  s64Int *NumErrors, *GlbNumErrors; /* symmetric: local / global error counts */

  s64Int SyncSize;  /* length of each pSync array */
  long *llpSync;    /* pSync for the 64-bit broadcast and the long long reduction */
  long long *llpWrk;/* pWrk  for the long long reduction */
  long *ipSync;     /* pSync for the int reduction */
  int *ipWrk;       /* pWrk  for the int reduction */

  FILE *outFile = NULL;
  double *GUPs;      /* symmetric broadcast destination */
  double *temp_GUPs; /* symmetric broadcast source (filled in on PE 0) */

  int *sAbort, *rAbort; /* symmetric: local / summed allocation-failure flag */

  /* llpSync is shared between a broadcast and a reduction, so every pSync
     array is made large enough for either kind of collective. */
  SyncSize = _SHMEM_BCAST_SYNC_SIZE;
  if (SyncSize < _SHMEM_REDUCE_SYNC_SIZE) SyncSize = _SHMEM_REDUCE_SYNC_SIZE;

  /* Allocate symmetric memory.
     NOTE(review): the shmem_malloc results are used unchecked, as before. */
  sAbort = (int *)shmem_malloc(sizeof(int));
  rAbort = (int *)shmem_malloc(sizeof(int));
  llpSync = (long *)shmem_malloc(sizeof(long) * SyncSize);
  /* With nreduce == 1 the reduction work arrays need
     _SHMEM_REDUCE_MIN_WRKDATA_SIZE elements. */
  llpWrk = (long long *)shmem_malloc(sizeof(long long) * _SHMEM_REDUCE_MIN_WRKDATA_SIZE);
  ipSync = (long *)shmem_malloc(sizeof(long) * SyncSize);
  ipWrk = (int *)shmem_malloc(sizeof(int) * _SHMEM_REDUCE_MIN_WRKDATA_SIZE);

  GUPs = (double *)shmem_malloc(sizeof(double));
  temp_GUPs = (double *)shmem_malloc(sizeof(double));
  GlbNumErrors = (s64Int *)shmem_malloc(sizeof(s64Int));
  NumErrors = (s64Int *)shmem_malloc(sizeof(s64Int));

  *GlbNumErrors = 0;
  *NumErrors = 0;
  *GUPs = -1;
  *temp_GUPs = -1;
  *sAbort = 0;
  *rAbort = 0;

  for (i = 0; i < SyncSize; i += 1){
    ipSync[i] = _SHMEM_SYNC_VALUE;
    llpSync[i] = _SHMEM_SYNC_VALUE;
  }

  params->SHMEMGUPs = -1;

  NumProcs = shmem_n_pes();
  MyProc = shmem_my_pe();

  if (0 == MyProc) {
    outFile = stdout;
    setbuf(outFile, NULL);
  }

  /* NOTE(review): this overwrites whatever the caller stored in
     HPLMaxProcMem with a fixed value -- confirm that this is intended. */
  params->HPLMaxProcMem = 200000;

  TotalMem = params->HPLMaxProcMem; /* max single node memory */
  TotalMem *= NumProcs;             /* max memory in NumProcs nodes */

  TotalMem /= sizeof(u64Int);

  /* calculate TableSize --- the size of update array (must be a power of 2) */
  for (TotalMem *= 0.5, logTableSize = 0, TableSize = 1;
       TotalMem >= 1.0;
       TotalMem *= 0.5, logTableSize++, TableSize <<= 1)
    ; /* EMPTY */

  /* Only power-of-two PE counts are supported.  Leave through the common
     cleanup path so the symmetric allocations above are released. */
  if ( (NumProcs & (NumProcs - 1)) != 0 ) {
    if (MyProc == 0) {
      printf("Number of processes must be power of 2\n");
    }
    goto cleanup;
  }

  /* logNumProcs = log2(NumProcs); exact because NumProcs is a power of 2 */
  logNumProcs = 0;
  for (i = 1; i < NumProcs; i <<= 1)
    logNumProcs++;

  PowerofTwo = HPCC_TRUE;
  Remainder = 0;
  Top = 0;
  MinLocalTableSize = (TableSize / NumProcs);
  LocalTableSize = MinLocalTableSize;
  GlobalStartMyProc = (MinLocalTableSize * MyProc);

  /* Element type matches the declared type of HPCC_Table (u64Int *). */
  HPCC_Table = HPCC_XMALLOC( u64Int, LocalTableSize );

  if (! HPCC_Table) *sAbort = 1;

  /* Agree across all PEs on whether any table allocation failed. */
  shmem_barrier_all();
  shmem_int_sum_to_all(rAbort, sAbort, 1, 0, 0, NumProcs, ipWrk, ipSync);
  shmem_barrier_all();

  if (*rAbort > 0) {
    if (MyProc == 0) fprintf(outFile, "Failed to allocate memory for the main table.\n");
    /* check all allocations in case there are new added and their order changes */
    if (HPCC_Table) HPCC_free( HPCC_Table );
    HPCC_Table = NULL;
    goto cleanup;
  }

  params->SHMEMRandomAccess_N = (s64Int)TableSize;

  /* Default number of global updates to table: 4x number of table entries */
  NumUpdates_Default = 4 * TableSize;
  ProcNumUpdates = 4*LocalTableSize;
  NumUpdates = NumUpdates_Default;

  if (MyProc == 0) {
    fprintf( outFile, "Running on %d processors%s\n", NumProcs, PowerofTwo ? " (PowerofTwo)" : "");
    fprintf( outFile, "Total Main table size = 2^" FSTR64 " = " FSTR64 " words\n",
             logTableSize, TableSize );
    fprintf( outFile, "PE Main table size = 2^" FSTR64 " = " FSTR64 " words/PE\n",
             (logTableSize - logNumProcs), TableSize/NumProcs );

    fprintf( outFile, "Default number of updates (RECOMMENDED) = " FSTR64 "\n", NumUpdates_Default);
    params->SHMEMRandomAccess_ExeUpdates = NumUpdates;
  }

  /* Initialize main table: each entry holds its own global index */
  for (i=0; i<LocalTableSize; i++)
    HPCC_Table[i] = i + GlobalStartMyProc;

  shmem_barrier_all();

  /* Begin timed section */
  RealTime = -RTSEC();

  Power2NodesRandomAccessUpdate(logTableSize, TableSize, LocalTableSize,
                                MinLocalTableSize, GlobalStartMyProc, Top,
                                logNumProcs, NumProcs, Remainder,
                                MyProc, ProcNumUpdates);

  shmem_barrier_all();

  /* End timed section */
  RealTime += RTSEC();

  /* Print timing results */
  if (MyProc == 0){
    params->SHMEMRandomAccess_time = RealTime;
    *temp_GUPs = 1e-9*NumUpdates / RealTime;
    fprintf( outFile, "Real time used = %.6f seconds\n", RealTime );
    fprintf( outFile, "%.9f Billion(10^9) Updates    per second [GUP/s]\n",
             *temp_GUPs );
    fprintf( outFile, "%.9f Billion(10^9) Updates/PE per second [GUP/s]\n",
             *temp_GUPs / NumProcs );
    /* No longer reporting per CPU number */
  }

  /* Distribute the result to all PEs.  Both broadcast buffers are symmetric
     allocations; the broadcast does not write dest on the root, so PE 0
     takes the value from its source buffer. */
  shmem_barrier_all();
  shmem_broadcast64(GUPs, temp_GUPs, 1, 0, 0, 0, NumProcs, llpSync);
  params->SHMEMGUPs = (0 == MyProc) ? *temp_GUPs : *GUPs;

  /* Verification phase */

  /* Begin timing here */
  shmem_barrier_all();

  RealTime = -RTSEC();

  HPCC_Power2NodesSHMEMRandomAccessCheck(logTableSize, TableSize, LocalTableSize,
                                         GlobalStartMyProc,
                                         logNumProcs, NumProcs,
                                         MyProc, ProcNumUpdates,
                                         NumErrors);

  shmem_barrier_all();
  shmem_longlong_sum_to_all( GlbNumErrors,  NumErrors, 1, 0,0, NumProcs,llpWrk, llpSync);
  /* End timed section */

  RealTime += RTSEC();

  if(MyProc == 0){
    params->SHMEMRandomAccess_CheckTime = RealTime;

    fprintf( outFile, "Verification:  Real time used = %.6f seconds\n", RealTime);
    /* Up to 1% of the table entries may be wrong and still pass. */
    fprintf( outFile, "Found " FSTR64 " errors in " FSTR64 " locations (%s).\n",
             *GlbNumErrors, TableSize, (*GlbNumErrors <= 0.01*TableSize) ?
             "passed" : "failed");
    if (*GlbNumErrors > 0.01*TableSize) params->Failure = 1;
    params->SHMEMRandomAccess_Errors = (s64Int) *GlbNumErrors;
    params->SHMEMRandomAccess_ErrorsFraction = (double) *GlbNumErrors / (double)TableSize;
    params->SHMEMRandomAccess_Algorithm = 1;
  }
  /* End verification phase */

  /* Deallocate memory (in reverse order of allocation which should
     help fragmentation) */

  HPCC_free( HPCC_Table );
  HPCC_Table = NULL;

  cleanup:

  /* outFile is stdout on PE 0 and is deliberately not closed here, so that
     later output from this process is not lost. */

  shmem_barrier_all();

  shmem_free(NumErrors);
  shmem_free(GlbNumErrors);
  shmem_free(temp_GUPs);
  shmem_free(GUPs);
  shmem_free(sAbort);
  shmem_free(rAbort);
  shmem_free(llpSync);
  shmem_free(llpWrk);
  shmem_free(ipSync);
  shmem_free(ipWrk);

  return 0;
}