Laden...

CUDA Installation und Test

Folgende Pakete muessen installiert sein:

 

     SuSE:

     zypper install gcc gcc-c++ make freeglut-devel kernel-source

 

     centos:

     yum -y install kernel.x86_64 kernel-headers.x86_64 kernel-devel.x86_64 freeglut.x86_64 freeglut-devel.x86_64

 

 

CUDA installieren:

 

     su - root

     cd

          FAI: ls /fs1/packetmirror/cuda

                    cudadriver_2.3_linux_64_190.18.run  cudatoolkit_2.3_linux_64_rhel5.3.run   cudasdk_2.3_linux.run

                    cudatoolkit_2.3_linux_64_suse11.1.run  NVIDIA_GPU_Computing_SDK-sysgen-opensuse-11.1

     rsync -aviP 172.22.222.1:/fs1/packetmirror/cuda .

     chmod 755 *run

     ./cudadriver_2.3_linux_64_190.18.run -a

     ./cudasdk_2.3_linux.run --help

     ./cudasdk_2.3_linux.run --nox11

     SuSE:

     ./cudatoolkit_2.3_linux_64_suse11.1.run --nox11

     centos:

     ./cudatoolkit_2.3_linux_64_rhel5.3.run --nox11

     export PATH=/usr/local/cuda/bin:$PATH

     export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

     ldconfig

     vim /etc/ld.so.conf

             /usr/X11R6/lib64/Xaw3d

             /usr/X11R6/lib64

             /usr/lib64/Xaw3d

             /usr/X11R6/lib/Xaw3d

             /usr/X11R6/lib

             /usr/lib/Xaw3d

             /usr/x86_64-suse-linux/lib

             /usr/local/lib

             /opt/kde3/lib

             /lib64

             /lib

             /usr/lib64

             /usr/lib

             /usr/local/lib64

             /opt/kde3/lib64

             include /etc/ld.so.conf.d/*.conf

             /usr/local/cuda/lib64

     ldconfig

 

CUDA Beispiel-Applikationen, alle auf einmal kompilieren:

 

     pushd ~/NVIDIA_GPU_Computing_SDK/C

     make

     popd

 

CUDA Beispiel-Applikationen einzeln kompilieren, z.B.:

 

     pushd ~/NVIDIA_GPU_Computing_SDK/C/src/simpleMultiGPU

     make

     popd

 

CUDA Beispiel-Applikationen koennen unter:

 

     pushd ~/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/

     ...

     popd

 

ausgefuehrt werden.

 

Damit die relativ kurzen Laeufe auch fuer Stabilitaetstests und zur Pruefung der Waermestabilitaet verwendet werden koennen, wurden die Applikationen simpleMultiGPU und MonteCarloMultiGPU erweitert: Sie wiederholen die Berechnung jetzt intern 400-mal. Die Anzahl der GPU-Karten wird automatisch ermittelt, sodass alle GPUs unter "Dampf" gesetzt werden. Die abgeaenderten Beispiele liegen in Verzeichnissen mit dem Suffix .long; beim Kompilieren ueberschreiben sie die Programme simpleMultiGPU und MonteCarloMultiGPU unter ~/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/:

 

[root@node001-172 src]# vim simpleMultiGPU.long

[root@node001-172 src]# cd simpleMultiGPU.long

[root@node001-172 simpleMultiGPU.long]# ls

Makefile  obj  simpleMultiGPU.cpp  simpleMultiGPU.h  simpleMultiGPU_kernel.cu

[root@node001-172 simpleMultiGPU.long]# vim simpleMultiGPU.cpp  <-- Aenderungen in Fettdruck

// Data configuration
//
// MAX_GPU_COUNT caps the number of devices main() will use and sizes the
// per-GPU arrays declared there.
// DATA_N is the number of float input elements: 1048576*32*32 equals 2^30,
// i.e. 32 times the value in the commented-out declaration kept below
// (presumably the unmodified SDK setting). main() allocates
// DATA_N * sizeof(float) bytes of host memory for the input.
// NOTE(review): the initializer is evaluated in int arithmetic before it is
// converted to long; 2^30 still fits a 32-bit int, a larger factor would not.

////////////////////////////////////////////////////////////////////////////////

const int MAX_GPU_COUNT = 8;

const long int        DATA_N = 1048576*32*32;

//const int        DATA_N = 1048576*32;

 

 

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv){

    //Solver config

    TGPUplan      plan[MAX_GPU_COUNT];

    //GPU reduction results

    float     h_SumGPU[MAX_GPU_COUNT];

    //OS thread ID

    CUTThread threadID[MAX_GPU_COUNT];

 

    float *h_Data;

    float sumGPU;

    double sumCPU, diff;

 

    int i, gpuBase, GPU_N;

    unsigned int hTimer;

 

    cutilCheckError(cutCreateTimer(&hTimer));

 

    cutilSafeCall(cudaGetDeviceCount(&GPU_N));

    printf("GPU_N %d \n", GPU_N);

    if(GPU_N > MAX_GPU_COUNT) GPU_N = MAX_GPU_COUNT;

    printf("CUDA-capable device count: %i\n", GPU_N);

 

    printf("main(): generating input data...\n");

        h_Data = (float *)malloc(DATA_N * sizeof(float));

        for(i = 0; i < DATA_N; i++)

            h_Data[i] = (float)rand() / (float)RAND_MAX;

 

    //Subdividing input data across GPUs

    //Get data sizes for each GPU

    for(i = 0; i < GPU_N; i++)

        plan[i].dataN = DATA_N / GPU_N;

    //Take into account "odd" data sizes

    for(i = 0; i < DATA_N % GPU_N; i++)

        plan[i].dataN++;

    //Assign data ranges to GPUs

    gpuBase = 0;

    for(i = 0; i < GPU_N; i++){

        plan[i].device = i;

        plan[i].h_Data = h_Data + gpuBase;

        plan[i].h_Sum = h_SumGPU + i;

        gpuBase += plan[i].dataN;

    }

 

    //Start timing of GPU code

    printf("main(): waiting for GPU results...\n");

    for(int jj = 0; jj < 400; jj++) {

      printf("====================== %d ==============\n", jj);

      cutilCheckError(cutResetTimer(hTimer));

      cutilCheckError(cutStartTimer(hTimer));

          for(i = 0; i < GPU_N; i++)

              threadID[i] = cutStartThread((CUT_THREADROUTINE)solverThread, (void *)(plan + i));

          cutWaitForThreads(threadID, GPU_N);

          sumGPU = 0;

          for(i = 0; i < GPU_N; i++)

              sumGPU += h_SumGPU[i];

      cutilCheckError(cutStopTimer(hTimer));

      printf("GPU Processing time: %f (ms) \n", cutGetTimerValue(hTimer));

 

      printf("Checking the results...\n");

      cutilCheckError(cutResetTimer(hTimer));

      cutilCheckError(cutStartTimer(hTimer));

          sumCPU = 0;

          for(i = 0; i < DATA_N; i++)

              sumCPU += h_Data[i];

      cutilCheckError(cutStopTimer(hTimer));

      printf("CPU Processing time: %f (ms) \n", cutGetTimerValue(hTimer));

 

      diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);

      printf("GPU sum: %f; CPU sum: %f\n", sumGPU, sumCPU);

      printf("Relative difference: %E \n", diff);

      printf((diff < 1e-6) ? "TEST PASSED\n" : "TEST FAILED\n");

    }

 

    printf("Shutting down...\n");

        cutilCheckError(cutDeleteTimer(hTimer));

        free(h_Data);

 

    cudaThreadExit();

 

    //cutilExit(argc, argv);

}

[root@node001-172 simpleMultiGPU.long]# cd ../MonteCarloMultiGPU.long

[root@node001-172 MonteCarloMultiGPU.long]# ls

his_cuda_11.10.09    MonteCarlo_gold.cpp     MonteCarlo_reduction.cuh  obj

Makefile             MonteCarlo_kernel.cuh   MonteCarlo_SM10.cu        quasirandomGenerator_kernel.cuh

MonteCarlo_common.h  MonteCarloMultiGPU.cpp  MonteCarlo_SM13.cu        realtype.h

[root@node001-172 MonteCarloMultiGPU.long]# vim MonteCarloMultiGPU.cpp

/*

 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.

 *

 * NVIDIA Corporation and its licensors retain all intellectual property and

 * proprietary rights in and to this software and related documentation and

 * any modifications thereto.  Any use, reproduction, disclosure, or distribution

 * of this software and related documentation without an express license

 * agreement from NVIDIA Corporation is strictly prohibited.

 *

 */

 

/*

 * This sample evaluates fair call price for a

 * given set of European options using Monte Carlo approach.

 * See supplied whitepaper for more explanations.

 */

 

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cuda_runtime.h>

#include <cutil_inline.h>

#include <multithreading.h>

#include "MonteCarlo_common.h"

 

#ifdef WIN32

#define strcasecmp strcmpi

#endif

 

////////////////////////////////////////////////////////////////////////////////

// Common functions

////////////////////////////////////////////////////////////////////////////////

// Returns a pseudo-random value between low and high, drawn from rand().
// The result is a linear blend of the two bounds, so the sequence depends on
// the current srand() seed.
float randFloat(float low, float high){
    // Blend weight in [0, 1]: 0 selects low, 1 selects high
    const int sample = rand();
    const float weight = (float)sample / (float)RAND_MAX;

    return (1.0f - weight) * low + weight * high;
}

 

///////////////////////////////////////////////////////////////////////////////

// CPU reference functions

///////////////////////////////////////////////////////////////////////////////

// CPU Monte Carlo reference; declared here and defined elsewhere
// (presumably MonteCarlo_gold.cpp -- confirm).
// main() calls it, only when DO_CPU is defined, once per option with
// h_Random = NULL and pathN = PATH_N, then reads callValue.Expected and
// callValue.Confidence.
extern "C" void MonteCarloCPU(

    TOptionValue&   callValue,

    TOptionData optionData,

    float *h_Random,

    int pathN

);

 

//Black-Scholes formula for call options
//The result is returned through the CallResult reference; main() uses it as
//the reference value for each GPU result.

extern "C" void BlackScholesCall(

    float& CallResult,

    TOptionData optionData

);

 

// Helper prototypes defined elsewhere; neither is called in the code shown
// in this listing.
extern "C" double NormalDistribution(unsigned int i, unsigned int pathN);

extern "C" double MoroInvCND(double prob);

 

////////////////////////////////////////////////////////////////////////////////

// GPU kernel code

////////////////////////////////////////////////////////////////////////////////

// Host-callable entry points of the GPU code, in two parallel sets.
// solverThread() calls the _SM13 set when useDoublePrecision is non-zero and
// the _SM10 set otherwise, always in the order
// inverseCND -> initMonteCarlo -> MonteCarlo -> closeMonteCarlo.
// Definitions are outside this file (presumably MonteCarlo_SM10.cu and
// MonteCarlo_SM13.cu -- confirm).
extern "C" void initMonteCarlo_SM10(TOptionPlan *plan);

extern "C" void closeMonteCarlo_SM10(TOptionPlan *plan);

extern "C" void MonteCarlo_SM10(TOptionPlan *plan);

extern "C" void inverseCND_SM10(float *d_Output, float *d_Input, unsigned int N);

extern "C" void initMonteCarlo_SM13(TOptionPlan *plan);

extern "C" void closeMonteCarlo_SM13(TOptionPlan *plan);

extern "C" void MonteCarlo_SM13(TOptionPlan *plan);

extern "C" void inverseCND_SM13(float *d_Output, float *d_Input, unsigned int N);

 

////////////////////////////////////////////////////////////////////////////////

// GPU-driving host thread

////////////////////////////////////////////////////////////////////////////////

// Non-zero selects the *_SM13 code path in solverThread(), zero the *_SM10
// path. main() sets it from the "type" command-line argument.
unsigned int useDoublePrecision;

 

static CUT_THREADPROC solverThread(TOptionPlan *plan){

    unsigned int hTimer;

    cutilCheckError( cutCreateTimer(&hTimer) );

 

    //Init GPU

    cutilSafeCall( cudaSetDevice(plan->device) );

 

    cudaDeviceProp deviceProp;

    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, plan->device));

    int version = deviceProp.major * 10 + deviceProp.minor;

    if(useDoublePrecision && version < 13){

        printf("Double precision is not supported on device %i.\n", plan->device);

        exit(0);

    }

 

    //Allocate memory for normally distributed samples

    cutilSafeCall( cudaMalloc(

        (void **)&plan->d_Samples,

        plan->pathN * sizeof(float)

    ) );

 

    //Generate normally distributed samples

    if(useDoublePrecision)

        inverseCND_SM13(plan->d_Samples, NULL, plan->pathN);

    else

        inverseCND_SM10(plan->d_Samples, NULL, plan->pathN);

 

    //Allocate intermediate memory for MC integrator

    if(useDoublePrecision)

        initMonteCarlo_SM13(plan);

    else

        initMonteCarlo_SM10(plan);

 

    //Main computations

    cutilSafeCall( cudaThreadSynchronize() );

    cutilCheckError( cutResetTimer(hTimer) );

    cutilCheckError( cutStartTimer(hTimer) );

        if(useDoublePrecision)

            MonteCarlo_SM13(plan);

        else

            MonteCarlo_SM10(plan);

    cutilSafeCall( cudaThreadSynchronize() );

    cutilCheckError( cutStopTimer(hTimer) );

    plan->time = cutGetTimerValue(hTimer);

 

    //Shut down this GPU

    if(useDoublePrecision)

        closeMonteCarlo_SM13(plan);

    else

        closeMonteCarlo_SM10(plan);

    cutilSafeCall( cudaFree(plan->d_Samples) );

    cutilCheckError( cutDeleteTimer(hTimer) );

 

    cudaThreadExit();

 

    CUT_THREADEND;

}

 

///////////////////////////////////////////////////////////////////////////////

// Main program

///////////////////////////////////////////////////////////////////////////////

// Compile-time switch for the CPU reference run in main(). The macro is
// defined and immediately undefined, so the #ifdef DO_CPU section is compiled
// out; remove the #undef to enable it.
#define DO_CPU

#undef DO_CPU

 

// Same pattern for the per-option "BS: ...; delta: ..." printout in main():
// disabled as written, remove the #undef to enable it.
#define PRINT_RESULTS

#undef PRINT_RESULTS

 

int main(int argc, char **argv){

    char *precisionChoice;

    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);

    if(precisionChoice == NULL)

        useDoublePrecision = 0;

    else{

        if(!strcasecmp(precisionChoice, "double"))

            useDoublePrecision = 1;

        else

            useDoublePrecision = 0;

    }

 

    const int MAX_GPU_COUNT = 8;

    const int         OPT_N = 256;

    const int        PATH_N = 1 << 18;

    const unsigned int SEED = 777;

 

    //Input data array

    TOptionData optionData[OPT_N];

    //Final GPU MC results

    TOptionValue callValueGPU[OPT_N];

    //"Theoretical" call values by Black-Scholes formula

    float callValueBS[OPT_N];

    //Solver config

    TOptionPlan optionSolver[MAX_GPU_COUNT];

    //OS thread ID

    CUTThread threadID[MAX_GPU_COUNT];

 

    //GPU number present in the system

    int GPU_N;

    int gpuBase, gpuIndex;

    int i;

 

    double

        delta, ref, sumDelta, sumRef, sumReserve;

 

    cutilSafeCall( cudaGetDeviceCount(&GPU_N) );

 

#ifdef _EMU

        GPU_N = 1;

#endif

                printf("main(): generating input data...\n");

        srand(123);

        for(i = 0; i < OPT_N; i++){

            optionData[i].S = randFloat(5.0f, 50.0f);

            optionData[i].X = randFloat(10.0f, 25.0f);

            optionData[i].T = randFloat(1.0f, 5.0f);

            optionData[i].R = 0.06f;

            optionData[i].V = 0.10f;

            callValueGPU[i].Expected   = -1.0f;

            callValueGPU[i].Confidence = -1.0f;

        }

 

    printf("main(): starting %i host threads...\n", GPU_N);

 

    printf("main(): waiting for GPU results...\n");

    for(int jj = 0; jj < 400; jj++) {

      printf("====================== %d ==============\n", jj);

 

          //Get option count for each GPU

          for(i = 0; i < GPU_N; i++)

              optionSolver[i].optionCount = OPT_N / GPU_N;

          //Take into account cases with "odd" option counts

          for(i = 0; i < (OPT_N % GPU_N); i++)

              optionSolver[i].optionCount++;

 

          //Assign GPU option ranges

          gpuBase = 0;

          for(i = 0; i < GPU_N; i++){

              optionSolver[i].device     = i;

              optionSolver[i].optionData = optionData   + gpuBase;

              optionSolver[i].callValue  = callValueGPU + gpuBase;

              optionSolver[i].seed       = SEED;

              optionSolver[i].pathN      = PATH_N;

              gpuBase += optionSolver[i].optionCount;

          }

 

          //Start CPU thread for each GPU

          for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)

              threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);

 

      printf("main(): waiting for GPU results...\n");

          cutWaitForThreads(threadID, GPU_N);

 

      printf("main(): GPU statistics\n");

          for(i = 0; i < GPU_N; i++){

              printf("GPU #%i\n", optionSolver[i].device);

              printf("Options         : %i\n", optionSolver[i].optionCount);

              printf("Simulation paths: %i\n", optionSolver[i].pathN);

              printf("Time (ms.)      : %f\n", optionSolver[i].time);

              printf("Options per sec.: %f\n", optionSolver[i].optionCount / (optionSolver[i].time * 0.001));

          }

 

    }

 

#ifdef DO_CPU

    printf("main(): running CPU MonteCarlo...\n");

        TOptionValue callValueCPU;

        sumDelta = 0;

        sumRef   = 0;

        for(i = 0; i < OPT_N; i++){

            MonteCarloCPU(

                callValueCPU,

                optionData[i],

                NULL,

                PATH_N

            );

            delta     = fabs(callValueCPU.Expected - callValueGPU[i].Expected);

            ref       = callValueCPU.Expected;

            sumDelta += delta;

            sumRef   += fabs(ref);

            printf("Exp : %f | %f\t", callValueCPU.Expected,   callValueGPU[i].Expected);

            printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);

        }

    printf("L1 norm: %E\n", sumDelta / sumRef);

#endif

 

    printf("main(): comparing Monte Carlo and Black-Scholes results...\n");

        sumDelta   = 0;

        sumRef     = 0;

        sumReserve = 0;

        for(i = 0; i < OPT_N; i++){

            BlackScholesCall(

                callValueBS[i],

                optionData[i]

            );

            delta     = fabs(callValueBS[i] - callValueGPU[i].Expected);

            ref       = callValueBS[i];

            sumDelta += delta;

            sumRef   += fabs(ref);

            if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta;

#ifdef PRINT_RESULTS

            printf("BS: %f; delta: %E\n", callValueBS[i], delta);

#endif

        }

    sumReserve /= OPT_N;

    printf("L1 norm        : %E\n", sumDelta / sumRef);

    printf("Average reserve: %f\n", sumReserve);

    printf((sumReserve > 1.0f) ? "TEST PASSED\n" : "TEST FAILED.\n");

 

    printf("Shutting down...\n");

 

    cutilExit(argc, argv);

}

[root@node001-172 MonteCarloMultiGPU.long]#

Article Details: Views:
Last updated:
2010/07/15
Article
article viewed 150 times
Autor:
Autor
Dieter Nikisch