|
E:\study\delft\work\gasw\source\Cuda.targets(45,5): error MSB3721: 命令“echo "$(CUDA_BIN_PATH)\nvcc.exe" --opencc-options -LIST:source=on -ccbin "e:\IDEs\VS2010\VC\bin" -D_DEBUG -D_WIN32 -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"./" -I"../../common/inc" -I"../../../shared/inc" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -Xcompiler "/EHsc /W3 /nologo /Od /Zi /MDd " -maxrregcount=32 --ptxas-options=-v -gencode=arch=compute_13,code=\"sm_13,compute_13\" --compile -o "$(IntDir)\$(InputName).cu.obj" "E:\study\delft\work\gasw\source\gpu\main.cu"
1>E:\study\delft\work\gasw\source\Cuda.targets(45,5): error MSB3721: "$(CUDA_BIN_PATH)\nvcc.exe" --opencc-options -LIST:source=on -gencode=arch=compute_13,code=\"sm_13,compute_13\" -ccbin "e:\IDEs\VS2010\VC\bin" -D_DEBUG -D_WIN32 -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"./" -I"../../common/inc" -I"../../../shared/inc" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -Xcompiler "/EHsc /W3 /nologo /Od /Zi /MDd " -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -maxrregcount=32 --ptxas-options=-v -gencode=arch=compute_13,code=\"sm_13,compute_13\" --compile -o "$(IntDir)\$(InputName).cu.obj" "E:\study\delft\work\gasw\source\gpu\main.cu"”已退出,返回代码为 1。
1>
这是错误信息:error MSB3721,返回代码为 1。
在网上搜了好多解决方法,都没什么效果,环境变量配置应该没什么问题,sdk里面的示例程序都可以运行
代码是 Smith-Waterman 算法的 GPU 加速实现,.cu 文件的代码如下:
- #include "stdafx.h"
- #include "main_cu.h"
- #include "gpudb.h"
- #include "fastafile.h"
- #include "main.h"
//__constant__ queryType c_query[63*1024/sizeof(queryType)]; /**< Query sequence in constant memory */

// Scoring parameters in constant memory; launchSW uploads them with cudaMemcpyToSymbol.
__constant__ int c_gapPenaltyTotal;  // Gap + Extend penalties
__constant__ int c_gapExtendPenalty; // Extend penalty
__constant__ int c_queryLength;      // Length of query sequence in chunks of 4

// Query profile texture; launchSW binds it to a cudaArray and sets two address modes.
// NOTE(review): the template arguments were missing from the pasted text, and a bare
// `texture` does not compile because the element type has no default. They are
// reconstructed here from usage (char4 query-profile scores, two addressing dimensions,
// raw element reads) -- confirm against the original source file.
texture<char4, 2, cudaReadModeElementType> t_queryProfile;
/**
 * Aligns four consecutive query profile entries (substScore.x .. substScore.w)
 * with one database residue, updating four vertically adjacent cells of a column.
 *
 * maxScore    in/out: raised to the largest of the four freshly computed cell scores.
 * left        in: the four cell scores of the column to the left; out: this column's scores.
 * ixLeft      in/out: horizontal gap scores for the four cells, clamped at 0.
 * top         in: score above the first cell; out: left.w, i.e. the top value for the next query chunk.
 * topLeft     in: diagonal predecessor of the first cell; out: the incoming value of top.
 * IyTop       in/out: vertical gap score carried down through the four cells.
 * substScore  substitution scores added to the diagonal predecessor of each cell.
 */
__device__ void alignResidues(scoreType &maxScore, scoreType4& left, int4& ixLeft, scoreType& top, scoreType& topLeft, int &IyTop, const char4 &substScore)
{
    // q0: the vertical gap is opened from the incoming top value.
    {
        const int diagonal = topLeft+substScore.x;
        IyTop = max(top+c_gapPenaltyTotal,IyTop+c_gapExtendPenalty);
        ixLeft.x = max(0,max(left.x+c_gapPenaltyTotal,ixLeft.x+c_gapExtendPenalty)); // max(0,...) so IxColumn[] can be unsigned
        topLeft = left.x; // remember the old score before it is overwritten: diagonal for q1
        left.x = max(diagonal,max(ixLeft.x,IyTop));
    }

    // q1: the cell above is the q0 score that was just written.
    {
        const int diagonal = topLeft+substScore.y;
        IyTop = max(left.x+c_gapPenaltyTotal,IyTop+c_gapExtendPenalty);
        ixLeft.y = max(0,max(left.y+c_gapPenaltyTotal,ixLeft.y+c_gapExtendPenalty)); // max(0,...) so IxColumn[] can be unsigned
        topLeft = left.y;
        left.y = max(diagonal,max(ixLeft.y,IyTop));
    }

    // q2
    {
        const int diagonal = topLeft+substScore.z;
        IyTop = max(left.y+c_gapPenaltyTotal,IyTop+c_gapExtendPenalty);
        ixLeft.z = max(0,max(left.z+c_gapPenaltyTotal,ixLeft.z+c_gapExtendPenalty)); // max(0,...) so IxColumn[] can be unsigned
        topLeft = left.z;
        left.z = max(diagonal,max(ixLeft.z,IyTop));
    }

    // q3: last cell of the chunk; afterwards hand over the state for the next column / chunk.
    {
        const int diagonal = topLeft+substScore.w;
        IyTop = max(left.z+c_gapPenaltyTotal,IyTop+c_gapExtendPenalty);
        ixLeft.w = max(0,max(left.w+c_gapPenaltyTotal,ixLeft.w+c_gapExtendPenalty)); // max(0,...) so IxColumn[] can be unsigned
        left.w = max(diagonal,max(ixLeft.w,IyTop));
    }

    // The next column lies to the right of this one, so the current top becomes its top-left.
    topLeft = top;
    // The bottom cell of this chunk is the top value of the next query chunk.
    top = left.w;

    // Fold the four new cell scores into the running maximum.
    maxScore = max(left.x,
               max(left.y,
               max(left.z,
               max(left.w,maxScore))));
}
-
- /**
- Align a database sequence subblock with the entire query sequence.
- The loading/aligning with the query sequence in the 'inner' function as query sequence (constant) memory is much faster than the global memory in which the db resides.
- */
- __device__ inline void alignWithQuery(const seqType8 &s, int column, TempData2* tempColumn, scoreType &maxScore)
- {
-
- //Set the top related values to 0 as we're at the top of the matrix
- scoreType8 top = {0,0,0,0,0,0,0,0};
- scoreType topLeft = 0;
- int8 IyTop = {0,0,0,0,0,0,0,0};
- char4 substScores; //Query profile scores
- scoreType4 left;
- int4 ixLeft;
- for(int j=0;j> GPUdb::LOG2_BLOCK_SIZE;
- int groupNumInBlock = groupNum & (GPUdb::BLOCK_SIZE-1); //equals groupNum % GPUdb::BLOCK_SIZE
- int groupOffset = blockOffsets[seqBlock]+__umul24(groupNumInBlock,GPUdb::SUBBLOCK_SIZE);
- const GPUdb::seqType* group = &sequences[groupOffset];
-
-
- align(group,tempColumn,seqNum,scores); //Perform alignment
- groupNum+= gridDim.x*blockDim.x;
-
- }
- }
/**
 * Frees the buffers launchSW owns when it has to bail out.
 * Must only be called after the pinned host score array was allocated successfully.
 * Device pointers that are still 0 are skipped; *scores is reset to 0 after being freed
 * so the caller cannot free it a second time.
 */
static void releaseLaunchBuffers(scoreType** scores, scoreType* d_scores, TempData2* d_tempColumns)
{
    if(d_tempColumns)
        cudaFree(d_tempColumns);
    if(d_scores)
        cudaFree(d_scores);
    if(*scores)
    {
        cudaFreeHost(*scores);
        *scores = 0;
    }
}

/**
 * Main routine that executes on the host: uploads the query profile, the database and the
 * scoring parameters, runs the smithWaterman kernel and copies the scores back.
 *
 * @param scores             Receives a pinned host array (cudaMallocHost) holding one score per
 *                           database sequence. Only valid when the function returns true; on a
 *                           failure after allocation the array is released and *scores is set to 0.
 * @param query              Unused here (the constant-memory copy of the query is disabled).
 * @param queryLength        Query length; converted to chunks of sizeof(queryType).
 * @param db                 Database to upload and align against.
 * @param substitutionMatrix Provides the query profile that is bound to t_queryProfile.
 * @param gapPenalty         Gap penalty; uploaded as gapPenalty + gapExtendPenalty.
 * @param gapExtendPenalty   Gap extend penalty.
 * @param time               Receives the kernel wall-clock time in seconds (only on success of the kernel).
 * @return true on success, false after printing a message on any failure.
 */
bool launchSW(scoreType** scores, void* query, size_t queryLength, GPUdb& db, FastaMatrix& substitutionMatrix, int gapPenalty, int gapExtendPenalty, double& time)
{
    (void)query; // kept for interface compatibility only

    //Prepare query profile
    if(!substitutionMatrix.copyQueryProfileToGPU())
    {
        puts("Error uploading query profile.");
        return false;
    }
    t_queryProfile.addressMode[0] = cudaAddressModeWrap;
    t_queryProfile.addressMode[1] = cudaAddressModeWrap;
    t_queryProfile.filterMode = cudaFilterModePoint;
    t_queryProfile.normalized = false;
    if(cudaBindTextureToArray(t_queryProfile,substitutionMatrix.getQueryProfileCudaArray(),substitutionMatrix.getQueryProfileChannelDesc())!=cudaSuccess)
    {
        puts("Error binding query profile texture.");
        return false;
    }

    //Prepare database
    if(!db.copyToGPU())
    {
        puts("Error uploading database.");
        return false;
    }

    //Prepare query length
    size_t queryLengthInChunks = WHOLE_AMOUNT_OF(queryLength,sizeof(queryType));
    size_t queryLengthDiv2InChunks = WHOLE_AMOUNT_OF(queryLength/2,sizeof(queryType));
    // c_queryLength is an int symbol: copy an int, never sizeof(size_t) bytes, which would
    // write past the 4-byte symbol on 64-bit builds.
    const int queryLengthForDevice = (int)queryLengthInChunks;
    if(queryLengthForDevice < 0 || (size_t)queryLengthForDevice != queryLengthInChunks)
    {
        puts("Error copying query sequence length.");
        return false;
    }
    if(cudaMemcpyToSymbol(c_queryLength,&queryLengthForDevice,sizeof(queryLengthForDevice),0,cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        puts("Error copying query sequence length.");
        return false;
    }

    //Prepare penalties
    int gapPenaltyTotal = gapPenalty + gapExtendPenalty;
    if(cudaMemcpyToSymbol(c_gapPenaltyTotal,&gapPenaltyTotal,sizeof(gapPenaltyTotal),0,cudaMemcpyHostToDevice)!=cudaSuccess
        || cudaMemcpyToSymbol(c_gapExtendPenalty,&gapExtendPenalty,sizeof(gapExtendPenalty),0,cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        puts("Error copying penalties.");
        return false;
    }

    // Reject tiny databases before anything is allocated so nothing has to be released.
    if(db.getNumSequences() < 16)
    {
        puts("Database does not contain enough sequences to have a full thread block.");
        return false;
    }

    //Determine launch configuration
    cudaDeviceProp props;
    if(cudaGetDeviceProperties(&props,0)!=cudaSuccess)
    {
        puts("Error querying device properties.");
        return false;
    }
    int blocks = props.multiProcessorCount*4;
    int threadsPerBlock = props.maxThreadsPerBlock/8; //Blocksize should be a multiple of 32 threads
    int blocksPerhw = (int)ceil((double)db.getNumBlocks()/(double)(blocks*threadsPerBlock/16));
    printf("Using %d blocks of %d threads: %d threads for %d sequences in %d blocks.\n",blocks,threadsPerBlock,blocks*threadsPerBlock,db.getNumSequences(),db.getNumBlocks());
    printf("Processing %d blocks per half warp.\n",blocksPerhw);

    //Prepare score array
    size_t scoreArraySize = sizeof(scoreType)*db.getNumSequences();
    if(cudaMallocHost(scores,scoreArraySize)!=cudaSuccess)
    {
        puts("Error allocating host score array.");
        return false;
    }

    scoreType* d_scores = 0;
    TempData2* d_tempColumns = 0;
    if(cudaMalloc(&d_scores,scoreArraySize)!=cudaSuccess)
    {
        puts("Error allocating device score array: database too large?.");
        releaseLaunchBuffers(scores,d_scores,d_tempColumns);
        return false;
    }
    //Set scores to -1 so we can check if they were actually all written by the kernel.
    if(cudaMemset(d_scores,-1,scoreArraySize)!=cudaSuccess)
    {
        puts("Error initialising device score array.");
        releaseLaunchBuffers(scores,d_scores,d_tempColumns);
        return false;
    }

    //Prepare temporary score matrices: one F and one Ix column per thread.
    //Computed in size_t: the per-thread size times the thread count overflows an int quickly.
    size_t matrixSize = queryLengthDiv2InChunks*sizeof(queryType)*sizeof(TempData2); //Size of temporary storage for one thread
    matrixSize *= (size_t)blocks*(size_t)threadsPerBlock;
    if(cudaMalloc(&d_tempColumns,matrixSize)!=cudaSuccess)
    {
        puts("Error allocating temporary matrix: too many threads for sequence size?");
        releaseLaunchBuffers(scores,d_scores,d_tempColumns);
        return false;
    }

    //Run kernel
    fflush(stdout);
    puts("Running...");

    clock_t start = clock();
    // NOTE(review): the launch configuration was missing from the pasted text ("<<>>"); it is
    // reconstructed from the blocks/threadsPerBlock values computed and reported above -- confirm.
    smithWaterman<<<blocks,threadsPerBlock>>>(db.getNumBlocks()*GPUdb::BLOCK_SIZE,d_scores,db.get_d_BlockOffsets(), db.get_d_SeqNums(),db.get_d_Sequences(), d_tempColumns);
    cudaError_t kernelStatus = cudaGetLastError();  // launch errors (bad configuration)
    if(kernelStatus==cudaSuccess)
        kernelStatus = cudaDeviceSynchronize();     // execution errors; replaces deprecated cudaThreadSynchronize
    clock_t stop = clock();
    if(kernelStatus!=cudaSuccess)
    {
        printf("Kernel execution failed: %s\n",cudaGetErrorString(kernelStatus));
        releaseLaunchBuffers(scores,d_scores,d_tempColumns);
        return false;
    }
    time=(stop-start)/(double)CLOCKS_PER_SEC;

    //Get scores
    if(cudaMemcpy((void*) *scores,d_scores,scoreArraySize,cudaMemcpyDeviceToHost)!=cudaSuccess)
    {
        puts("Error copying scores back to the host.");
        releaseLaunchBuffers(scores,d_scores,d_tempColumns);
        return false;
    }

    cudaFree(d_tempColumns);
    cudaFree(d_scores);
    return true;
}
希望各位大大帮帮忙,这里先谢过。
|
|