CUDA compilation error, please help, much appreciated!

Posted on 2014-1-24 21:15:55
E:\study\delft\work\gasw\source\Cuda.targets(45,5): error MSB3721: The command "echo "$(CUDA_BIN_PATH)\nvcc.exe"  --opencc-options -LIST:source=on  -ccbin "e:\IDEs\VS2010\VC\bin"  -D_DEBUG -D_WIN32   -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"./" -I"../../common/inc" -I"../../../shared/inc" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -Xcompiler "/EHsc /W3 /nologo /Od /Zi   /MDd  " -maxrregcount=32 --ptxas-options=-v -gencode=arch=compute_13,code=\"sm_13,compute_13\"   --compile -o "$(IntDir)\$(InputName).cu.obj" "E:\study\delft\work\gasw\source\gpu\main.cu"

1>E:\study\delft\work\gasw\source\Cuda.targets(45,5): error MSB3721: The command ""$(CUDA_BIN_PATH)\nvcc.exe"  --opencc-options -LIST:source=on  -gencode=arch=compute_13,code=\"sm_13,compute_13\"    -ccbin "e:\IDEs\VS2010\VC\bin"  -D_DEBUG -D_WIN32   -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -I"./" -I"../../common/inc" -I"../../../shared/inc" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -Xcompiler "/EHsc /W3 /nologo /Od /Zi   /MDd  " -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" -maxrregcount=32 --ptxas-options=-v -gencode=arch=compute_13,code=\"sm_13,compute_13\"   --compile -o "$(IntDir)\$(InputName).cu.obj" "E:\study\delft\work\gasw\source\gpu\main.cu"" exited with code 1.

That is the error output: error MSB3721 with return code 1.
I've searched online for many solutions and none of them had any effect. The environment variables should be configured correctly, since the sample programs in the SDK all run fine.
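MSB3721 is only MSBuild's generic wrapper: it says nvcc returned exit code 1, but nvcc's own diagnostic is printed separately and is sometimes swallowed by the build window. One way to surface it is to run the same command by hand in a Visual Studio 2010 command prompt, from the project's source directory so the relative -I paths resolve. A sketch, reusing the paths and flags from the log above (gencode quoting simplified):

    "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" ^
      --opencc-options -LIST:source=on ^
      -ccbin "e:\IDEs\VS2010\VC\bin" -D_DEBUG -D_WIN32 ^
      -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include" ^
      -I"./" -I"../../common/inc" -I"../../../shared/inc" ^
      -Xcompiler "/EHsc /W3 /nologo /Od /Zi /MDd" ^
      -maxrregcount=32 --ptxas-options=-v ^
      -gencode arch=compute_13,code=sm_13 ^
      --compile -o main.cu.obj "E:\study\delft\work\gasw\source\gpu\main.cu"

Whatever nvcc prints before exiting is the actual error to fix.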

The code is a GPU-accelerated implementation of Smith-Waterman; the .cu file is below.
#include "stdafx.h"
#include "main_cu.h"
#include "gpudb.h"
#include "fastafile.h"
#include "main.h"

//__constant__ queryType c_query[63*1024/sizeof(queryType)]; /**< Query sequence in constant memory */
__constant__ int c_gapPenaltyTotal;  //Gap + Extend penalties
__constant__ int c_gapExtendPenalty; //Extend penalty
__constant__ int c_queryLength;      //Length of query sequence in chunks of 4

texture<char4, 2, cudaReadModeElementType> t_queryProfile; //Template arguments were stripped by the forum's HTML; a 2D char4 element-fetch texture is inferred from the bind/fetch code below

/**
Align 4 query profile entries with a database residue.
*/
__device__ void alignResidues(scoreType &maxScore, scoreType4 &left, int4 &ixLeft, scoreType &top, scoreType &topLeft, int &IyTop, const char4 &substScore)
{
    //q0
    ixLeft.x = max(0,max(left.x+c_gapPenaltyTotal,ixLeft.x+c_gapExtendPenalty)); //Max(0,...) here so IxColumn[] can be unsigned
    IyTop = max(top+c_gapPenaltyTotal,IyTop+c_gapExtendPenalty);
    int align = topLeft+substScore.x;
    topLeft = left.x;
    left.x = max(align,max(ixLeft.x,IyTop));

    //q1
    ixLeft.y = max(0,max(left.y+c_gapPenaltyTotal,ixLeft.y+c_gapExtendPenalty)); //Max(0,...) here so IxColumn[] can be unsigned
    IyTop = max(left.x+c_gapPenaltyTotal,IyTop+c_gapExtendPenalty);
    align = topLeft+substScore.y;
    topLeft = left.y;
    left.y = max(align,max(ixLeft.y,IyTop));

    //q2
    ixLeft.z = max(0,max(left.z+c_gapPenaltyTotal,ixLeft.z+c_gapExtendPenalty)); //Max(0,...) here so IxColumn[] can be unsigned
    IyTop = max(left.y+c_gapPenaltyTotal,IyTop+c_gapExtendPenalty);
    align = topLeft+substScore.z;
    topLeft = left.z;
    left.z = max(align,max(ixLeft.z,IyTop));

    //q3
    ixLeft.w = max(0,max(left.w+c_gapPenaltyTotal,ixLeft.w+c_gapExtendPenalty)); //Max(0,...) here so IxColumn[] can be unsigned
    IyTop = max(left.z+c_gapPenaltyTotal,IyTop+c_gapExtendPenalty);
    align = topLeft+substScore.w;
    left.w = max(align,max(ixLeft.w,IyTop));

    topLeft = top; //The next column is to the right of this one, so current top left becomes new top
    top = left.w;  //Set top value for next query chunk
    maxScore = max(left.x,max(left.y,max(left.z,max(left.w,maxScore)))); //Update max score
}
/**
Align a database sequence subblock with the entire query sequence.
The loading/aligning with the query sequence happens in this 'inner' function because the query's constant memory is much faster than the global memory in which the db resides.
*/
__device__ inline void alignWithQuery(const seqType8 &s, int column, TempData2* tempColumn, scoreType &maxScore)
{
    //Set the top related values to 0 as we're at the top of the matrix
    scoreType8 top = {0,0,0,0,0,0,0,0};
    scoreType topLeft = 0;
    int8 IyTop = {0,0,0,0,0,0,0,0};

    char4 substScores; //Query profile scores
    scoreType4 left;
    int4 ixLeft;
    for(int j=0;j<c_queryLength;j++)
    {
        //NOTE: the body of this loop, the end of this function, and the head of the
        //smithWaterman kernel were lost when the forum stripped everything between
        //'<' and '>' as an HTML tag. The kernel below is reconstructed from the
        //surviving fragment and the launch site in launchSW(); treat the signature,
        //the loop head, and the parameter types as assumptions.
    }
}

__global__ void smithWaterman(int numGroups, scoreType* scores, const unsigned int* blockOffsets, const unsigned int* seqNums, const GPUdb::seqType* sequences, TempData2* tempColumns)
{
    int groupNum = blockIdx.x*blockDim.x + threadIdx.x; //Assumed: one group per thread, grid-stride
    while(groupNum < numGroups) //Assumed loop head; only the body below survives
    {
        int seqBlock = groupNum >> GPUdb::LOG2_BLOCK_SIZE;
        int groupNumInBlock = groupNum & (GPUdb::BLOCK_SIZE-1); //equals groupNum % GPUdb::BLOCK_SIZE
        int groupOffset = blockOffsets[seqBlock]+__umul24(groupNumInBlock,GPUdb::SUBBLOCK_SIZE);
        const GPUdb::seqType* group = &sequences[groupOffset];

        //tempColumn and seqNum were presumably derived from tempColumns/seqNums in the lost lines
        align(group,tempColumn,seqNum,scores); //Perform alignment
        groupNum += gridDim.x*blockDim.x;
    }
}

// main routine that executes on the host
bool launchSW(scoreType** scores, void* query, size_t queryLength, GPUdb& db, FastaMatrix& substitutionMatrix, int gapPenalty, int gapExtendPenalty, double& time)
{
    /*
    //Prepare substitution matrix
    if(!substitutionMatrix.copyToGPU())
    {
        puts("Error uploading substitution matrix.");
        return false;
    }
    t_substMatrix.addressMode[0] = cudaAddressModeWrap;
    t_substMatrix.addressMode[1] = cudaAddressModeWrap;
    t_substMatrix.filterMode = cudaFilterModePoint;
    t_substMatrix.normalized = false;
    cudaBindTextureToArray(t_substMatrix,substitutionMatrix.getCudaArray(),substitutionMatrix.getChannelDesc());
    */

    //Prepare query profile
    if(!substitutionMatrix.copyQueryProfileToGPU())
    {
        puts("Error uploading query profile.");
        return false;
    }
    t_queryProfile.addressMode[0] = cudaAddressModeWrap;
    t_queryProfile.addressMode[1] = cudaAddressModeWrap;
    t_queryProfile.filterMode = cudaFilterModePoint;
    t_queryProfile.normalized = false;
    cudaBindTextureToArray(t_queryProfile,substitutionMatrix.getQueryProfileCudaArray(),substitutionMatrix.getQueryProfileChannelDesc());

    //Prepare database
    if(!db.copyToGPU())
    {
        puts("Error uploading database.");
        return false;
    }

    //Prepare query
/*  if(cudaMemcpyToSymbol(c_query,query,queryLength,0,cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        puts("Error copying query sequence.");
        return false;
    }*/
    size_t queryLengthInChunks = WHOLE_AMOUNT_OF(queryLength,sizeof(queryType));
    size_t queryLengthDiv2InChunks = WHOLE_AMOUNT_OF(queryLength/2,sizeof(queryType));
    if(cudaMemcpyToSymbol(c_queryLength,&queryLengthInChunks,sizeof(queryLengthInChunks),0,cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        puts("Error copying query sequence length.");
        return false;
    }

    //Prepare penalties
    int gapPenaltyTotal = gapPenalty + gapExtendPenalty;
    if(cudaMemcpyToSymbol(c_gapPenaltyTotal,&gapPenaltyTotal,sizeof(gapPenaltyTotal),0,cudaMemcpyHostToDevice)!=cudaSuccess
     | cudaMemcpyToSymbol(c_gapExtendPenalty,&gapExtendPenalty,sizeof(gapExtendPenalty),0,cudaMemcpyHostToDevice)!=cudaSuccess)
    {
        puts("Error copying penalties.");
        return false;
    }

    //Prepare score array
    size_t scoreArraySize = sizeof(scoreType)*db.getNumSequences();
    if(cudaMallocHost(scores,scoreArraySize)!=cudaSuccess)
    {
        puts("Error allocating host score array.");
        return false;
    }

    scoreType* d_scores;
    if(cudaMalloc(&d_scores,scoreArraySize)!=cudaSuccess)
    {
        puts("Error allocating device score array: database too large?");
        return false;
    }
    cudaMemset(d_scores,-1,scoreArraySize); //Set scores to -1 so we can check if they were actually all written by the kernel.

    int matrixSize = queryLengthDiv2InChunks*sizeof(queryType)*sizeof(TempData2); //Size of temporary storage for one thread

    //Determine launch configuration
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props,0);
    size_t free, total;
    cudaMemGetInfo(&free,&total);

    if(db.getNumSequences() < 16)
    {
        puts("Database does not contain enough sequences to have a full thread block.");
        return false;
    }

    int blocks = props.multiProcessorCount*4;
    int threadsPerBlock = props.maxThreadsPerBlock/8; //Block size should be a multiple of 32 threads

    int blocksPerhw = (int)ceil((double)db.getNumBlocks()/(double)(blocks*threadsPerBlock/16));

    //threadsPerBlock=1;blocks=32;
    printf("Using %d blocks of %d threads: %d threads for %d sequences in %d blocks.\n",blocks,threadsPerBlock,blocks*threadsPerBlock,db.getNumSequences(),db.getNumBlocks());
    printf("Processing %d blocks per half warp.\n",blocksPerhw);
    //printf("Processing %d sequences per thread.\n",symbolsPerThread);

    //Prepare temporary score matrices: one F and one Ix column per thread
    TempData2* d_tempColumns;
    matrixSize *= blocks*threadsPerBlock;

    if(cudaMalloc(&d_tempColumns,matrixSize)!=cudaSuccess)
    {
        puts("Error allocating temporary matrix: too many threads for sequence size?");
        return false;
    }
    /*if(cudaMemset(d_tempColumns,0,matrixSize)!=cudaSuccess)
    {
        puts("Error allocating temporary matrix.");
        return false;
    }*/

    //Run kernel
    fflush(stdout);
    puts("Running...");

    clock_t start = clock();
    smithWaterman<<<blocks,threadsPerBlock>>>(db.getNumBlocks()*GPUdb::BLOCK_SIZE,d_scores,db.get_d_BlockOffsets(),db.get_d_SeqNums(),db.get_d_Sequences(),d_tempColumns); //Launch configuration restored; the forum ate the text between the angle brackets

    if(cudaThreadSynchronize()!=cudaSuccess)
        return false;

    clock_t stop = clock();
    time = (stop-start)/(double)CLOCKS_PER_SEC;

    //Get scores
    cudaMemcpy((void*) *scores,d_scores,scoreArraySize,cudaMemcpyDeviceToHost);

    cudaFree(d_tempColumns);
    cudaFree(d_scores);
    return true;
}
Hoping someone can help. Thanks in advance!


OP | Posted on 2014-1-25 00:12:05
After a closer look it seems I picked the wrong GPU arch: the -gencode=arch=compute_13,code=\"sm_13,compute_13\" part should be compute_20. But I've already changed this in the VS2010 project property settings and it has no effect at all; I don't know why.
[Attached screenshot: C:\Users\wdy\Desktop\QQ截图20140124171051.jpg]
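If the architecture flag really is the problem, the card's compute capability can be checked directly instead of guessed, so the right -gencode value is known for certain. A minimal standalone sketch using the CUDA runtime API:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        int count = 0;
        if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
        {
            puts("No CUDA-capable device found.");
            return 1;
        }
        for (int i = 0; i < count; ++i)
        {
            cudaDeviceProp props;
            cudaGetDeviceProperties(&props, i);
            //A device reporting 2.0 wants -gencode arch=compute_20,code=sm_20
            printf("Device %d: %s, compute capability %d.%d\n",
                   i, props.name, props.major, props.minor);
        }
        return 0;
    }

Also worth noting: the failing command comes from E:\study\delft\work\gasw\source\Cuda.targets, a custom targets file in the project's own source tree, not from the CUDA 5.5 build customization that the VS2010 property pages edit. If that file hard-codes -gencode=arch=compute_13, changing the property pages would have no effect, which matches the symptom; editing the flag inside Cuda.targets itself would be the thing to try.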
Posted on 2014-1-25 19:05:21
I always compile directly from the command line, and it doesn't seem to need this setting. Could it be that the system can find the device automatically?
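For what it's worth, nvcc never probes the installed GPU: at compile time it targets whatever -arch/-gencode says, falling back to a fixed default (compute_10 on toolkits of that era) when nothing is given. What does happen automatically is runtime selection among the images embedded in the binary, so one hedged workaround is to build for several architectures at once and let the driver pick, e.g.:

    nvcc -gencode arch=compute_13,code=sm_13 ^
         -gencode arch=compute_20,code=sm_20 ^
         -gencode arch=compute_20,code=compute_20 ^
         --compile main.cu

The last -gencode line also embeds PTX so newer devices can JIT-compile a matching image.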
Posted on 2018-10-17 17:21:20
The 680 is just a low-power transitional product, so of course things aren't simply getting better and better the way you might assume.