CUDA Installation und Test
Folgende Pakete muessen installiert sein:
SuSE:
zypper install gcc gcc-c++ make freeglut-devel kernel-source
centos:
yum -y install kernel.x86_64 kernel-headers.x86_64 kernel-devel.x86_64 freeglut.x86_64 freeglut-devel.x86_64
CUDA installieren:
su - root
cd
FAI: ls /fs1/packetmirror/cuda
cudadriver_2.3_linux_64_190.18.run cudatoolkit_2.3_linux_64_rhel5.3.run cudasdk_2.3_linux.run
cudatoolkit_2.3_linux_64_suse11.1.run NVIDIA_GPU_Computing_SDK-sysgen-opensuse-11.1
rsync -aviP 172.22.222.1:/fs1/packetmirror/cuda .
chmod 755 *run
./cudadriver_2.3_linux_64_190.18.run -a
./cudasdk_2.3_linux.run --help
./cudasdk_2.3_linux.run --nox11
SuSE:
./cudatoolkit_2.3_linux_64_suse11.1.run --nox11
centos:
./cudatoolkit_2.3_linux_64_rhel5.3.run --nox11
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ldconfig
vim /etc/ld.so.conf
/usr/X11R6/lib64/Xaw3d
/usr/X11R6/lib64
/usr/lib64/Xaw3d
/usr/X11R6/lib/Xaw3d
/usr/X11R6/lib
/usr/lib/Xaw3d
/usr/x86_64-suse-linux/lib
/usr/local/lib
/opt/kde3/lib
/lib64
/lib
/usr/lib64
/usr/lib
/usr/local/lib64
/opt/kde3/lib64
include /etc/ld.so.conf.d/*.conf
/usr/local/cuda/lib64
ldconfig
CUDA Beispiel-Applikationen, alle auf einmal installieren:
pushd ~/NVIDIA_GPU_Computing_SDK/C
make
popd
CUDA Beispiel-Applikationen, einzeln installieren. z.B.:
pushd ~/NVIDIA_GPU_Computing_SDK/C/src/simpleMultiGPU
make
popd
CUDA Beispiel-Applikationen koennen unter:
pushd ~/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/
...
popd
ausgefuehrt werden.
Damit die relativ kurzen Laeufe auch fuer Stabilitaetstests und zur Pruefung der Waermestabilitaet verwendet werden koennen, wurden die Applikationen simpleMultiGPU und MonteCarloMultiGPU erweitert; sie durchlaufen jetzt 400 (interne) Schleifendurchgaenge. Die Anzahl der GPU-Karten wird automatisch ermittelt, sodass alle GPUs unter "Dampf" gesetzt werden. Die abgeaenderten Beispiele liegen in Verzeichnissen mit dem Suffix .long; beim Kompilieren ueberschreiben sie die Beispiele simpleMultiGPU und MonteCarloMultiGPU unter ~/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/:
[root@node001-172 src]# vim simpleMultiGPU.long
[root@node001-172 src]# cd simpleMultiGPU.long
[root@node001-172 simpleMultiGPU.long]# ls
Makefile obj simpleMultiGPU.cpp simpleMultiGPU.h simpleMultiGPU_kernel.cu
[root@node001-172 simpleMultiGPU.long]# vim simpleMultiGPU.cpp <-- Aenderungen in Fettdruck
// Data configuration
////////////////////////////////////////////////////////////////////////////////
// Upper bound on the number of GPUs driven at once; sizes the per-GPU arrays in main().
const int MAX_GPU_COUNT = 8;
// Number of input floats summed per run: 2^30 (same value as 1048576*32*32).
// Previous, smaller setting kept for reference: const int DATA_N = 1048576*32;
const long int DATA_N = 1L << 30;
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
/**
 * Entry point of the long-running simpleMultiGPU variant.
 *
 * Fills a host buffer of DATA_N random floats, splits it across up to
 * MAX_GPU_COUNT devices, and then repeats the "sum on all GPUs, verify on
 * the CPU" cycle 400 times so the run can serve as a stability test.
 *
 * Returns EXIT_FAILURE when no device is reported or the host buffer cannot
 * be allocated; otherwise falls off the end of main (exit status 0).
 */
int main(int argc, char **argv){
    //Solver config
    TGPUplan plan[MAX_GPU_COUNT];
    //GPU reduction results
    float h_SumGPU[MAX_GPU_COUNT];
    //OS thread ID
    CUTThread threadID[MAX_GPU_COUNT];
    float *h_Data;
    float sumGPU;
    double sumCPU, diff;
    int i, gpuBase, GPU_N;
    unsigned int hTimer;

    cutilCheckError(cutCreateTimer(&hTimer));
    cutilSafeCall(cudaGetDeviceCount(&GPU_N));
    printf("GPU_N %d \n", GPU_N);
    if(GPU_N > MAX_GPU_COUNT) GPU_N = MAX_GPU_COUNT;
    //Without at least one device the partitioning below would divide by zero
    if(GPU_N < 1){
        fprintf(stderr, "main(): no CUDA-capable device available\n");
        cutilCheckError(cutDeleteTimer(hTimer));
        return EXIT_FAILURE;
    }
    printf("CUDA-capable device count: %i\n", GPU_N);

    printf("main(): generating input data...\n");
    //DATA_N floats is a multi-gigabyte request; it can fail on small nodes
    h_Data = (float *)malloc(DATA_N * sizeof(float));
    if(h_Data == NULL){
        fprintf(stderr, "main(): failed to allocate %lu bytes of host memory\n",
                (unsigned long)(DATA_N * sizeof(float)));
        cutilCheckError(cutDeleteTimer(hTimer));
        return EXIT_FAILURE;
    }
    for(i = 0; i < DATA_N; i++)
        h_Data[i] = (float)rand() / (float)RAND_MAX;

    //Subdividing input data across GPUs
    //Get data sizes for each GPU
    for(i = 0; i < GPU_N; i++)
        plan[i].dataN = DATA_N / GPU_N;
    //Take into account "odd" data sizes
    for(i = 0; i < DATA_N % GPU_N; i++)
        plan[i].dataN++;

    //Assign data ranges to GPUs
    gpuBase = 0;
    for(i = 0; i < GPU_N; i++){
        plan[i].device = i;
        plan[i].h_Data = h_Data + gpuBase;
        plan[i].h_Sum = h_SumGPU + i;
        gpuBase += plan[i].dataN;
    }

    //Start timing of GPU code
    printf("main(): waiting for GPU results...\n");
    //400 repetitions of the GPU run plus CPU verification (stress loop)
    for(int jj = 0; jj < 400; jj++) {
        printf("====================== %d ==============\n", jj);

        //GPU pass: one host thread per device, then combine the partial sums
        cutilCheckError(cutResetTimer(hTimer));
        cutilCheckError(cutStartTimer(hTimer));
        for(i = 0; i < GPU_N; i++)
            threadID[i] = cutStartThread((CUT_THREADROUTINE)solverThread, (void *)(plan + i));
        cutWaitForThreads(threadID, GPU_N);
        sumGPU = 0;
        for(i = 0; i < GPU_N; i++)
            sumGPU += h_SumGPU[i];
        cutilCheckError(cutStopTimer(hTimer));
        printf("GPU Processing time: %f (ms) \n", cutGetTimerValue(hTimer));

        //CPU pass: reference sum in double precision over the same data
        printf("Checking the results...\n");
        cutilCheckError(cutResetTimer(hTimer));
        cutilCheckError(cutStartTimer(hTimer));
        sumCPU = 0;
        for(i = 0; i < DATA_N; i++)
            sumCPU += h_Data[i];
        cutilCheckError(cutStopTimer(hTimer));
        printf("CPU Processing time: %f (ms) \n", cutGetTimerValue(hTimer));

        //Pass criterion: relative difference between both sums below 1e-6
        diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
        printf("GPU sum: %f; CPU sum: %f\n", sumGPU, sumCPU);
        printf("Relative difference: %E \n", diff);
        printf((diff < 1e-6) ? "TEST PASSED\n" : "TEST FAILED\n");
    }

    printf("Shutting down...\n");
    cutilCheckError(cutDeleteTimer(hTimer));
    free(h_Data);
    cudaThreadExit();
    //cutilExit(argc, argv);
}
[root@node001-172 simpleMultiGPU.long]# cd ../MonteCarloMultiGPU.long
[root@node001-172 MonteCarloMultiGPU.long]# ls
his_cuda_11.10.09 MonteCarlo_gold.cpp MonteCarlo_reduction.cuh obj
Makefile MonteCarlo_kernel.cuh MonteCarlo_SM10.cu quasirandomGenerator_kernel.cuh
MonteCarlo_common.h MonteCarloMultiGPU.cpp MonteCarlo_SM13.cu realtype.h
[root@node001-172 MonteCarloMultiGPU.long]# vim MonteCarloMultiGPU.cpp
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation and
* any modifications thereto. Any use, reproduction, disclosure, or distribution
* of this software and related documentation without an express license
* agreement from NVIDIA Corporation is strictly prohibited.
*
*/
/*
* This sample evaluates fair call price for a
* given set of European options using Monte Carlo approach.
* See supplied whitepaper for more explanations.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <multithreading.h>
#include "MonteCarlo_common.h"
#ifdef WIN32
#define strcasecmp strcmpi
#endif
////////////////////////////////////////////////////////////////////////////////
// Common functions
////////////////////////////////////////////////////////////////////////////////
// Draws one value from rand() and maps it linearly onto the interval
// spanned by low and high (rand() == 0 gives low, rand() == RAND_MAX gives high).
float randFloat(float low, float high){
    const float fraction = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    const float lowWeight = 1.0f - fraction;
    return lowWeight * low + fraction * high;
}
///////////////////////////////////////////////////////////////////////////////
// CPU reference functions
///////////////////////////////////////////////////////////////////////////////
// CPU Monte Carlo reference for a single option; the result is written to
// callValue. Only called from main() when DO_CPU is defined, where it is
// passed NULL for h_Random.
// NOTE(review): presumably implemented in MonteCarlo_gold.cpp - confirm.
extern "C" void MonteCarloCPU(
TOptionValue& callValue,
TOptionData optionData,
float *h_Random,
int pathN
);
//Black-Scholes formula for call options
// The result is written to CallResult; main() compares it against the GPU
// Monte Carlo estimate.
extern "C" void BlackScholesCall(
float& CallResult,
TOptionData optionData
);
// Declared here but not called anywhere in this file.
extern "C" double NormalDistribution(unsigned int i, unsigned int pathN);
// Declared here but not called anywhere in this file.
extern "C" double MoroInvCND(double prob);
////////////////////////////////////////////////////////////////////////////////
// GPU kernel code
////////////////////////////////////////////////////////////////////////////////
// Two parallel sets of GPU entry points. solverThread() uses the *_SM10 set
// when useDoublePrecision is zero and the *_SM13 set otherwise.
// NOTE(review): presumably implemented in MonteCarlo_SM10.cu and
// MonteCarlo_SM13.cu - confirm.

// Single-precision path: init / close bracket the MonteCarlo_SM10 call,
// inverseCND_SM10 fills d_Output with N samples.
extern "C" void initMonteCarlo_SM10(TOptionPlan *plan);
extern "C" void closeMonteCarlo_SM10(TOptionPlan *plan);
extern "C" void MonteCarlo_SM10(TOptionPlan *plan);
extern "C" void inverseCND_SM10(float *d_Output, float *d_Input, unsigned int N);
// Double-precision path: same call sequence; solverThread() refuses to use it
// on devices whose compute capability is below 1.3.
extern "C" void initMonteCarlo_SM13(TOptionPlan *plan);
extern "C" void closeMonteCarlo_SM13(TOptionPlan *plan);
extern "C" void MonteCarlo_SM13(TOptionPlan *plan);
extern "C" void inverseCND_SM13(float *d_Output, float *d_Input, unsigned int N);
////////////////////////////////////////////////////////////////////////////////
// GPU-driving host thread
////////////////////////////////////////////////////////////////////////////////
// Non-zero selects the *_SM13 (double precision) code path in solverThread().
// Written once by main() before any solver thread is started.
unsigned int useDoublePrecision;

/**
 * Host thread body that drives one GPU through a complete Monte Carlo run.
 *
 * Binds the calling thread to plan->device, allocates plan->pathN float
 * samples on the device, generates them, runs the integrator and stores the
 * elapsed time of the integrator call (in ms) in plan->time. All device
 * resources acquired here are released again before the thread ends.
 *
 * If double precision was requested but the device's compute capability is
 * below 1.3, the whole process is terminated with a failure status.
 */
static CUT_THREADPROC solverThread(TOptionPlan *plan){
    unsigned int hTimer;
    cutilCheckError( cutCreateTimer(&hTimer) );

    //Init GPU
    cutilSafeCall( cudaSetDevice(plan->device) );

    cudaDeviceProp deviceProp;
    cutilSafeCall(cudaGetDeviceProperties(&deviceProp, plan->device));
    //Compute capability encoded as major*10 + minor (1.3 -> 13)
    int version = deviceProp.major * 10 + deviceProp.minor;
    if(useDoublePrecision && version < 13){
        //Report on stderr and exit non-zero so wrapper scripts see the failure
        fprintf(stderr, "Double precision is not supported on device %i.\n", plan->device);
        exit(EXIT_FAILURE);
    }

    //Allocate memory for normally distributed samples
    cutilSafeCall( cudaMalloc(
        (void **)&plan->d_Samples,
        plan->pathN * sizeof(float)
    ) );

    //Generate normally distributed samples
    if(useDoublePrecision)
        inverseCND_SM13(plan->d_Samples, NULL, plan->pathN);
    else
        inverseCND_SM10(plan->d_Samples, NULL, plan->pathN);

    //Allocate intermediate memory for MC integrator
    if(useDoublePrecision)
        initMonteCarlo_SM13(plan);
    else
        initMonteCarlo_SM10(plan);

    //Main computations
    //Synchronize before and after so the timer brackets only the integrator
    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutResetTimer(hTimer) );
    cutilCheckError( cutStartTimer(hTimer) );

    if(useDoublePrecision)
        MonteCarlo_SM13(plan);
    else
        MonteCarlo_SM10(plan);

    cutilSafeCall( cudaThreadSynchronize() );
    cutilCheckError( cutStopTimer(hTimer) );
    plan->time = cutGetTimerValue(hTimer);

    //Shut down this GPU
    if(useDoublePrecision)
        closeMonteCarlo_SM13(plan);
    else
        closeMonteCarlo_SM10(plan);
    cutilSafeCall( cudaFree(plan->d_Samples) );
    cutilCheckError( cutDeleteTimer(hTimer) );

    cudaThreadExit();
    CUT_THREADEND;
}
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
// Compile-time switches. Each macro is defined and then immediately
// undefined, so both are OFF as written; remove the matching #undef line to
// turn one on.
// DO_CPU: enables the "#ifdef DO_CPU" section of main().
#define DO_CPU
#undef DO_CPU
// PRINT_RESULTS: enables the "#ifdef PRINT_RESULTS" printf in main().
#define PRINT_RESULTS
#undef PRINT_RESULTS
/**
 * Entry point of the long-running MonteCarloMultiGPU variant.
 *
 * Generates OPT_N random option descriptions, then 400 times distributes
 * them over the available GPUs (one host thread per GPU, at most
 * MAX_GPU_COUNT) and prints per-GPU statistics. After the loop the GPU
 * results of the last pass are compared against the Black-Scholes values.
 *
 * Returns EXIT_FAILURE if no device is reported; otherwise finishes through
 * cutilExit().
 */
int main(int argc, char **argv){
    //Precision comes from the "type" command-line argument: "double"
    //(case-insensitive) selects double precision, anything else single.
    //Initialised so the NULL test below never reads an indeterminate pointer.
    char *precisionChoice = NULL;
    cutGetCmdLineArgumentstr(argc, (const char **)argv, "type", &precisionChoice);
    if(precisionChoice == NULL)
        useDoublePrecision = 0;
    else{
        if(!strcasecmp(precisionChoice, "double"))
            useDoublePrecision = 1;
        else
            useDoublePrecision = 0;
    }

    const int MAX_GPU_COUNT = 8;
    const int OPT_N = 256;
    const int PATH_N = 1 << 18;
    const unsigned int SEED = 777;

    //Input data array
    TOptionData optionData[OPT_N];
    //Final GPU MC results
    TOptionValue callValueGPU[OPT_N];
    //"Theoretical" call values by Black-Scholes formula
    float callValueBS[OPT_N];
    //Solver config
    TOptionPlan optionSolver[MAX_GPU_COUNT];
    //OS thread ID
    CUTThread threadID[MAX_GPU_COUNT];

    //GPU number present in the system
    int GPU_N;
    int gpuBase, gpuIndex;
    int i;
    double
        delta, ref, sumDelta, sumRef, sumReserve;

    cutilSafeCall( cudaGetDeviceCount(&GPU_N) );
#ifdef _EMU
    GPU_N = 1;
#endif
    //optionSolver[] and threadID[] hold MAX_GPU_COUNT entries; never index past them
    if(GPU_N > MAX_GPU_COUNT) GPU_N = MAX_GPU_COUNT;
    //Without at least one device the partitioning below would divide by zero
    if(GPU_N < 1){
        fprintf(stderr, "main(): no CUDA-capable device available\n");
        return EXIT_FAILURE;
    }

    printf("main(): generating input data...\n");
    srand(123);
    for(i = 0; i < OPT_N; i++){
        optionData[i].S = randFloat(5.0f, 50.0f);
        optionData[i].X = randFloat(10.0f, 25.0f);
        optionData[i].T = randFloat(1.0f, 5.0f);
        optionData[i].R = 0.06f;
        optionData[i].V = 0.10f;
        //-1 marks "not computed yet"
        callValueGPU[i].Expected = -1.0f;
        callValueGPU[i].Confidence = -1.0f;
    }

    printf("main(): starting %i host threads...\n", GPU_N);
    printf("main(): waiting for GPU results...\n");
    //400 repetitions of the full multi-GPU run (stress loop)
    for(int jj = 0; jj < 400; jj++) {
        printf("====================== %d ==============\n", jj);

        //Get option count for each GPU
        for(i = 0; i < GPU_N; i++)
            optionSolver[i].optionCount = OPT_N / GPU_N;
        //Take into account cases with "odd" option counts
        for(i = 0; i < (OPT_N % GPU_N); i++)
            optionSolver[i].optionCount++;

        //Assign GPU option ranges
        gpuBase = 0;
        for(i = 0; i < GPU_N; i++){
            optionSolver[i].device = i;
            optionSolver[i].optionData = optionData + gpuBase;
            optionSolver[i].callValue = callValueGPU + gpuBase;
            optionSolver[i].seed = SEED;
            optionSolver[i].pathN = PATH_N;
            gpuBase += optionSolver[i].optionCount;
        }

        //Start CPU thread for each GPU
        for(gpuIndex = 0; gpuIndex < GPU_N; gpuIndex++)
            threadID[gpuIndex] = cutStartThread((CUT_THREADROUTINE)solverThread, &optionSolver[gpuIndex]);
        printf("main(): waiting for GPU results...\n");
        cutWaitForThreads(threadID, GPU_N);

        printf("main(): GPU statistics\n");
        for(i = 0; i < GPU_N; i++){
            printf("GPU #%i\n", optionSolver[i].device);
            printf("Options : %i\n", optionSolver[i].optionCount);
            printf("Simulation paths: %i\n", optionSolver[i].pathN);
            printf("Time (ms.) : %f\n", optionSolver[i].time);
            //time is in ms, hence the factor 0.001 to get options per second
            printf("Options per sec.: %f\n", optionSolver[i].optionCount / (optionSolver[i].time * 0.001));
        }
    }

#ifdef DO_CPU
    printf("main(): running CPU MonteCarlo...\n");
    TOptionValue callValueCPU;
    sumDelta = 0;
    sumRef = 0;
    for(i = 0; i < OPT_N; i++){
        MonteCarloCPU(
            callValueCPU,
            optionData[i],
            NULL,
            PATH_N
        );
        delta = fabs(callValueCPU.Expected - callValueGPU[i].Expected);
        ref = callValueCPU.Expected;
        sumDelta += delta;
        sumRef += fabs(ref);
        printf("Exp : %f | %f\t", callValueCPU.Expected, callValueGPU[i].Expected);
        printf("Conf: %f | %f\n", callValueCPU.Confidence, callValueGPU[i].Confidence);
    }
    printf("L1 norm: %E\n", sumDelta / sumRef);
#endif

    printf("main(): comparing Monte Carlo and Black-Scholes results...\n");
    sumDelta = 0;
    sumRef = 0;
    sumReserve = 0;
    for(i = 0; i < OPT_N; i++){
        BlackScholesCall(
            callValueBS[i],
            optionData[i]
        );
        delta = fabs(callValueBS[i] - callValueGPU[i].Expected);
        ref = callValueBS[i];
        sumDelta += delta;
        sumRef += fabs(ref);
        //Skip near-exact matches to avoid dividing by a (near) zero delta
        if(delta > 1e-6) sumReserve += callValueGPU[i].Confidence / delta;
#ifdef PRINT_RESULTS
        printf("BS: %f; delta: %E\n", callValueBS[i], delta);
#endif
    }
    sumReserve /= OPT_N;
    printf("L1 norm : %E\n", sumDelta / sumRef);
    printf("Average reserve: %f\n", sumReserve);
    //Pass criterion: average Confidence/delta ratio above 1
    printf((sumReserve > 1.0f) ? "TEST PASSED\n" : "TEST FAILED.\n");

    printf("Shutting down...\n");
    cutilExit(argc, argv);
}
[root@node001-172 MonteCarloMultiGPU.long]#
| Article Details: | Views: |
|---|---|
| Last updated: 2010/07/15 |
|
| Autor: | |
|
|
