Update:
Here is the modified source for the wrapper. It is minimal (only the functions that are actually needed are implemented) and adds throttling of kernel launches:
cudart.c:
- Code: Select all
#include <windows.h>
#include "cudart.h"
#include "wine/debug.h"
#if 0
#define __CUDA_INTERNAL_COMPILATION__
#include "crt/host_runtime.h"
#endif
/* Optional polling mode: when USE_SLEEPWAIT is defined, wine_cudaLaunch waits for the throttling
 * event by repeatedly calling cudaEventQuery and sleeping in between, instead of blocking in
 * cudaEventSynchronize. */
#ifdef USE_SLEEPWAIT
#include <time.h>
/* Sleep interval handed to nanosleep() between polls: 0 seconds plus USE_SLEEPWAIT in the second
 * field (nanoseconds, assuming the usual tv_sec/tv_nsec field order of struct timespec). */
static struct timespec sleepTime = { 0, USE_SLEEPWAIT };
#endif /* USE_SLEEPWAIT */
WINE_DEFAULT_DEBUG_CHANNEL(cuda);
/* Number of counted launches at which wine_cudaLaunch waits for the event before launching. */
#define QUEUE_MAX 20
/* Launch count at which the event is recorded, and the value numQueued is reset to after a wait. */
#define HALF_QUEUE_MAX ((QUEUE_MAX)/2)
/* Launch counter maintained by wine_cudaLaunch (incremented per launch, reset after a wait). */
static unsigned int numQueued = 0;
/* TRUE once wine_cudaLaunch has attempted to create `event`; checked by DllMain on detach. */
static BOOL eventInitialized = FALSE;
/* Event recorded on stream 0 and waited on to throttle launches. */
static cudaEvent_t event;
/* Names of the CUDA runtime error codes, used by debug_cudaError() for trace output.
 * The table is indexed directly by the numeric cudaError_t value, so the order of the entries
 * must match the enum in the CUDA headers this file is built against (presumably the CUDA
 * version current when this was written - verify before building against another version). */
static const char* cudaErrorString[] = {
"cudaSuccess",
"cudaErrorMissingConfiguration",
"cudaErrorMemoryAllocation",
"cudaErrorInitializationError",
"cudaErrorLaunchFailure",
"cudaErrorPriorLaunchFailure",
"cudaErrorLaunchTimeout",
"cudaErrorLaunchOutOfResources",
"cudaErrorInvalidDeviceFunction",
"cudaErrorInvalidConfiguration",
"cudaErrorInvalidDevice",
"cudaErrorInvalidValue",
"cudaErrorInvalidPitchValue",
"cudaErrorInvalidSymbol",
"cudaErrorMapBufferObjectFailed",
"cudaErrorUnmapBufferObjectFailed",
"cudaErrorInvalidHostPointer",
"cudaErrorInvalidDevicePointer",
"cudaErrorInvalidTexture",
"cudaErrorInvalidTextureBinding",
"cudaErrorInvalidChannelDescriptor",
"cudaErrorInvalidMemcpyDirection",
"cudaErrorAddressOfConstant",
"cudaErrorTextureFetchFailed",
"cudaErrorTextureNotBound",
"cudaErrorSynchronizationError",
"cudaErrorInvalidFilterSetting",
"cudaErrorInvalidNormSetting",
"cudaErrorMixedDeviceExecution",
"cudaErrorCudartUnloading",
"cudaErrorUnknown",
"cudaErrorNotYetImplemented",
"cudaErrorMemoryValueTooLarge",
"cudaErrorInvalidResourceHandle",
"cudaErrorNotReady"
};
/*
 * Translate a CUDA runtime error code into its symbolic name for trace output.
 *
 * The two codes handled explicitly are not covered by the cudaErrorString table; every other
 * code is looked up in the table by its numeric value. Codes that match neither are traced and
 * reported with a generic string.
 */
static const char* debug_cudaError(cudaError_t err) {
    switch (err) {
    case cudaErrorStartupFailure:
        return "cudaErrorStartupFailure";
    case cudaErrorApiFailureBase:
        return "cudaErrorApiFailureBase";
    default:
        break;
    }

    /* Table lookup, guarded against codes outside the table. */
    if (err >= 0 && err < sizeof(cudaErrorString) / sizeof(cudaErrorString[0])) {
        return cudaErrorString[err];
    }

    WINE_TRACE("unknown error %d\n", err);
    return "unknown CUDA error";
}
/*
 * DLL entry point.
 *
 * On process detach, destroys the throttling event if wine_cudaLaunch created one. No other
 * notification needs any work.
 *
 * Parameters:
 *   hInstDLL  - module handle (unused)
 *   fdwReason - reason the entry point is being called
 *   lpv       - reserved (unused)
 *
 * Returns TRUE for every notification. The original fell off the end of a BOOL function, so the
 * loader received an indeterminate value; a FALSE result on DLL_PROCESS_ATTACH makes the DLL
 * load fail.
 */
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpv)
{
    if (DLL_PROCESS_DETACH == fdwReason)
    {
        /* Cleanup */
        if (eventInitialized) {
            cudaError_t err;

            WINE_TRACE("releasing event %d\n", event);
            err = cudaEventDestroy(event);
            if (err) {
                /* Nothing more can be done during unload; just report it. */
                WINE_TRACE("cudaEventDestroy: %s\n", debug_cudaError(err));
            }
        }
    }
    return TRUE;
}
/*
 * Exported wrapper for cudaConfigureCall.
 *
 * Traces the grid and block dimensions, the shared memory size and the stream, forwards the
 * call unchanged, and traces the error name if the call fails. Returns the result of
 * cudaConfigureCall.
 */
cudaError_t WINAPI wine_cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem,
                                          cudaStream_t stream) {
    cudaError_t status;

    WINE_TRACE("(%d %d %d) (%d %d %d) %d %d\n", gridDim.x, gridDim.y, gridDim.z, blockDim.x,
               blockDim.y, blockDim.z, sharedMem, stream);

    status = cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
    if (status != cudaSuccess) {
        WINE_TRACE("return %s\n", debug_cudaError(status));
    }
    return status;
}
/* Exported wrapper for cudaGetDeviceProperties: emits a trace line and forwards the call,
 * returning its result unchanged. */
cudaError_t WINAPI wine_cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaGetDeviceProperties(prop, device);
    return status;
}
/* Exported wrapper for cudaGetErrorString: emits a trace line and returns whatever string the
 * CUDA runtime reports for `error`. */
const char* WINAPI wine_cudaGetErrorString(cudaError_t error) {
    const char *text;

    WINE_TRACE("\n");
    text = cudaGetErrorString(error);
    return text;
}
/* Exported wrapper for cudaGetLastError: traces entry, forwards the call, traces the symbolic
 * name of the returned code (including success) and returns it. */
cudaError_t WINAPI wine_cudaGetLastError() {
    cudaError_t lastErr;

    WINE_TRACE("\n");
    lastErr = cudaGetLastError();
    WINE_TRACE("return %s\n", debug_cudaError(lastErr));
    return lastErr;
}
/*
 * Exported wrapper for cudaLaunch that throttles the number of launches queued ahead of the GPU.
 *
 * Scheme (all on stream 0):
 *   - an event is recorded when the launch counter reaches HALF_QUEUE_MAX;
 *   - when the counter reaches QUEUE_MAX, the next call first waits for that event, resets the
 *     counter to HALF_QUEUE_MAX and records the event again, then performs its launch.
 * The wait is either a cudaEventQuery/nanosleep polling loop (USE_SLEEPWAIT defined) or a
 * blocking cudaEventSynchronize.
 *
 * Parameters:
 *   symbol - kernel entry passed through to cudaLaunch unchanged.
 *
 * Returns the result of cudaLaunch. Errors from the event calls are only logged; they never
 * replace the launch result.
 */
cudaError_t WINAPI wine_cudaLaunch(const char *symbol) {
    cudaError_t err;
    cudaError_t evtErr;

    WINE_TRACE("%p\n", symbol);

    if (QUEUE_MAX == numQueued) {
        if (WINE_TRACE_ON(cuda)) {
            /* print out if event was recorded or not */
            WINE_TRACE("check event recorded %s\n", debug_cudaError(cudaEventQuery(event)));
        }

        /* wait for event */
#ifdef USE_SLEEPWAIT
        {
            unsigned int sleepCount = 0;

            /* Poll only while the event is genuinely pending. Any other result (invalid event
             * handle, error left over from an earlier asynchronous call, ...) will not turn into
             * success by waiting, so looping on "!= cudaSuccess" would hang the caller forever. */
            while ((evtErr = cudaEventQuery(event)) == cudaErrorNotReady) {
                nanosleep(&sleepTime, NULL);
                sleepCount++;
            }
            if (evtErr) {
                WINE_ERR("cudaEventQuery: %s\n", debug_cudaError(evtErr));
            }
            WINE_TRACE("slept %u times\n", sleepCount);
        }
#else
        evtErr = cudaEventSynchronize(event);
        if (evtErr) {
            WINE_ERR("cudaEventSynchronize: %s\n", debug_cudaError(evtErr));
        }
#endif
        WINE_TRACE("event recorded, continuing\n");

        /* record a new event and subtract HALF_QUEUE_MAX from numQueued */
        numQueued = HALF_QUEUE_MAX;
        evtErr = cudaEventRecord(event, 0);
        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    err = cudaLaunch(symbol);

    if (!eventInitialized) {
        /* Create an event on the first cudaLaunch call. This is done here so the calling program
         * has a chance to select the GPU device with cudaSetDevice if desired. */
        evtErr = cudaEventCreate(&event);
        if (evtErr) {
            WINE_ERR("cudaEventCreate: %s\n", debug_cudaError(evtErr));
        }
        /* cudaEventCreate can return errors from previous asynchronous calls, so an error here does
         * not necessarily mean the event wasn't created. Assume it was created for now. */
        eventInitialized = TRUE;
        WINE_TRACE("created event %d\n", event);
    }

    /* record an event at HALF_QUEUE_MAX */
    if (HALF_QUEUE_MAX == ++numQueued) {
        evtErr = cudaEventRecord(event, 0); /* Assuming everything using stream 0 */
        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    if (err) {
        WINE_TRACE("return %s\n", debug_cudaError(err));
    }
    return err;
}
/* Exported wrapper for cudaMemcpy: traces the arguments, forwards the copy, traces the error
 * name on failure and returns the result of cudaMemcpy. */
cudaError_t WINAPI wine_cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
    cudaError_t status;

    WINE_TRACE("%p, %p, %d, %d\n", dst, src, count, kind);

    status = cudaMemcpy(dst, src, count, kind);
    if (status != cudaSuccess) {
        WINE_TRACE("return %s\n", debug_cudaError(status));
    }
    return status;
}
/* Exported wrapper for cudaMemcpyFromSymbol: emits a trace line and forwards all arguments,
 * returning the result unchanged. */
cudaError_t WINAPI wine_cudaMemcpyFromSymbol(void *dst, const char *symbol, size_t count, size_t offset,
                                             enum cudaMemcpyKind kind) {
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaMemcpyFromSymbol(dst, symbol, count, offset, kind);
    return status;
}
/* Exported wrapper for cudaMemcpyToSymbol: emits a trace line and forwards all arguments,
 * returning the result unchanged. */
cudaError_t WINAPI wine_cudaMemcpyToSymbol(const char *symbol, const void *src, size_t count, size_t offset,
                                           enum cudaMemcpyKind kind) {
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaMemcpyToSymbol(symbol, src, count, offset, kind);
    return status;
}
/* Exported wrapper for __cudaRegisterFatBinary: emits a trace line and returns the handle the
 * CUDA runtime hands back for `fatCubin`. */
void** WINAPI wine_cudaRegisterFatBinary(void *fatCubin) {
    void **handle;

    WINE_TRACE("\n");
    handle = __cudaRegisterFatBinary(fatCubin);
    return handle;
}
/* Exported wrapper for __cudaRegisterFunction: emits a trace line and passes every argument
 * through unchanged. */
void WINAPI wine_cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun,
                                      const char *deviceName, int thread_limit, uint3 *tid,
                                      uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize) {
    WINE_TRACE("\n");

    __cudaRegisterFunction(fatCubinHandle,
                           hostFun,
                           deviceFun,
                           deviceName,
                           thread_limit,
                           tid,
                           bid,
                           bDim,
                           gDim,
                           wSize);
}
/* Exported wrapper for __cudaRegisterVar: emits a trace line and passes every argument through
 * unchanged. */
void WINAPI wine_cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
                                 const char *deviceName, int ext, int size, int constant,
                                 int global) {
    WINE_TRACE("\n");

    __cudaRegisterVar(fatCubinHandle,
                      hostVar,
                      deviceAddress,
                      deviceName,
                      ext,
                      size,
                      constant,
                      global);
}
/* Exported wrapper for __cudaRegisterShared: emits a trace line and forwards both arguments. */
void WINAPI wine_cudaRegisterShared(void **fatCubinHandle, void **devicePtr)
{
    WINE_TRACE("\n");

    __cudaRegisterShared(fatCubinHandle, devicePtr);
}
/* Exported wrapper for __cudaRegisterSharedVar: emits a trace line and passes every argument
 * through unchanged. */
void WINAPI wine_cudaRegisterSharedVar(void **fatCubinHandle, void **devicePtr, size_t size,
                                       size_t alignment, int storage)
{
    WINE_TRACE("\n");

    __cudaRegisterSharedVar(fatCubinHandle,
                            devicePtr,
                            size,
                            alignment,
                            storage);
}
/* Exported wrapper for cudaSetDevice: emits a trace line and returns the result of the
 * forwarded call. */
cudaError_t WINAPI wine_cudaSetDevice(int device) {
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaSetDevice(device);
    return status;
}
/* Exported wrapper for cudaSetupArgument: emits a trace line and returns the result of the
 * forwarded call. */
cudaError_t WINAPI wine_cudaSetupArgument(void* arg, size_t count, size_t offset)
{
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaSetupArgument(arg, count, offset);
    return status;
}
/* Exported wrapper for cudaStreamQuery: emits a trace line and returns the result of the
 * forwarded call. */
cudaError_t WINAPI wine_cudaStreamQuery(cudaStream_t stream)
{
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaStreamQuery(stream);
    return status;
}
/* Exported wrapper for cudaThreadSynchronize: emits a trace line and returns the result of the
 * forwarded call. */
cudaError_t WINAPI wine_cudaThreadSynchronize(void)
{
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaThreadSynchronize();
    return status;
}
/* Exported wrapper for __cudaUnregisterFatBinary: emits a trace line and forwards the handle. */
void WINAPI wine_cudaUnregisterFatBinary(void **fatCubinHandle)
{
    WINE_TRACE("\n");

    __cudaUnregisterFatBinary(fatCubinHandle);
}
/* Exported wrapper for cudaFree: emits a trace line and returns the result of the forwarded
 * call. */
cudaError_t WINAPI wine_cudaFree(void *devPtr)
{
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaFree(devPtr);
    return status;
}
/* Exported wrapper for cudaMalloc: emits a trace line and returns the result of the forwarded
 * call. */
cudaError_t WINAPI wine_cudaMalloc(void **devPtr, size_t size)
{
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaMalloc(devPtr, size);
    return status;
}
cudart.h:
- Code: Select all
#include "cuda_runtime_api.h"
/* Prototypes for the CUDA runtime registration entry points that the wine_cudaRegister* and
 * wine_cudaUnregisterFatBinary wrappers in cudart.c forward to. They are declared here by hand,
 * presumably because cuda_runtime_api.h does not declare them - verify against the CUDA headers
 * in use, and keep the signatures in sync with the installed runtime. */

/* Registers a fat binary image; the returned handle is passed to the other functions below. */
void** __cudaRegisterFatBinary(void *fatCubin);
/* Counterpart of __cudaRegisterFatBinary; takes the handle it returned. */
void __cudaUnregisterFatBinary(void **fatCubinHandle);
/* Registers a function belonging to the fat binary identified by fatCubinHandle. */
void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun,
const char *deviceName, int thread_limit, uint3 *tid,
uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize);
/* Registers a variable belonging to the fat binary identified by fatCubinHandle. */
void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
const char *deviceName, int ext, int size, int constant,
int global);
/* Registers a shared-memory pointer for the fat binary identified by fatCubinHandle. */
void __cudaRegisterShared(void **fatCubinHandle, void **devicePtr);
/* As __cudaRegisterShared, with explicit size, alignment and storage arguments. */
void __cudaRegisterSharedVar(void **fatCubinHandle, void **devicePtr, size_t size,
size_t alignment, int storage);
cudart.dll.spec:
- Code: Select all
# Export table for the wrapper cudart.dll.
# "stdcall" entries map an exported name to the wine_* wrapper of the same purpose in cudart.c;
# "stub" entries are exported names that have no implementation in cudart.c.
@ stdcall __cudaRegisterFatBinary(ptr) wine_cudaRegisterFatBinary
@ stdcall __cudaRegisterFunction(ptr ptr ptr ptr long ptr ptr ptr ptr ptr) wine_cudaRegisterFunction
@ stdcall __cudaRegisterVar(ptr ptr ptr ptr long long long long) wine_cudaRegisterVar
@ stdcall __cudaRegisterShared(ptr ptr) wine_cudaRegisterShared
@ stdcall __cudaRegisterSharedVar(ptr ptr long long long) wine_cudaRegisterSharedVar
@ stdcall __cudaUnregisterFatBinary(ptr) wine_cudaUnregisterFatBinary
@ stub cudaBindTexture
@ stub cudaBindTextureToArray
@ stub cudaChooseDevice
# NOTE(review): wine_cudaConfigureCall takes its two dim3 arguments by value, while this entry
# declares them as "ptr" - confirm the declared argument list matches the real stack layout.
@ stdcall cudaConfigureCall(ptr ptr long long) wine_cudaConfigureCall
@ stub cudaCreateChannelDesc
@ stub cudaD3D9Begin
@ stub cudaD3D9End
@ stub cudaD3D9GetDevice
@ stub cudaD3D9GetDirect3DDevice
@ stub cudaD3D9MapResources
@ stub cudaD3D9MapVertexBuffer
@ stub cudaD3D9RegisterResource
@ stub cudaD3D9RegisterVertexBuffer
@ stub cudaD3D9ResourceGetMappedPitch
@ stub cudaD3D9ResourceGetMappedPointer
@ stub cudaD3D9ResourceGetMappedSize
@ stub cudaD3D9ResourceGetSurfaceDimensions
@ stub cudaD3D9ResourceSetMapFlags
@ stub cudaD3D9SetDirect3DDevice
@ stub cudaD3D9UnmapResources
@ stub cudaD3D9UnmapVertexBuffer
@ stub cudaD3D9UnregisterResource
@ stub cudaD3D9UnregisterVertexBuffer
@ stub cudaEventCreate
@ stub cudaEventDestroy
@ stub cudaEventElapsedTime
@ stub cudaEventQuery
@ stub cudaEventRecord
@ stub cudaEventSynchronize
@ stdcall cudaFree(ptr) wine_cudaFree
@ stub cudaFreeArray
@ stub cudaFreeHost
@ stub cudaGetChannelDesc
@ stub cudaGetDevice
@ stub cudaGetDeviceCount
@ stdcall cudaGetDeviceProperties(ptr long) wine_cudaGetDeviceProperties
@ stdcall cudaGetErrorString(long) wine_cudaGetErrorString
@ stdcall cudaGetLastError() wine_cudaGetLastError
@ stub cudaGetSymbolAddress
@ stub cudaGetSymbolSize
@ stub cudaGetTextureAlignmentOffset
@ stub cudaGetTextureReference
@ stub cudaGLMapBufferObject
@ stub cudaGLRegisterBufferObject
@ stub cudaGLSetGLDevice
@ stub cudaGLUnmapBufferObject
@ stub cudaGLUnregisterBufferObject
@ stdcall cudaLaunch(ptr) wine_cudaLaunch
@ stdcall cudaMalloc(ptr long) wine_cudaMalloc
@ stub cudaMalloc3D
@ stub cudaMalloc3DArray
@ stub cudaMallocArray
@ stub cudaMallocHost
@ stub cudaMallocPitch
@ stdcall cudaMemcpy(ptr ptr long long) wine_cudaMemcpy
@ stub cudaMemcpy2D
@ stub cudaMemcpy2DArrayToArray
@ stub cudaMemcpy2DFromArray
@ stub cudaMemcpy2DToArray
@ stub cudaMemcpy3D
@ stub cudaMemcpyArrayToArray
@ stub cudaMemcpyFromArray
@ stdcall cudaMemcpyFromSymbol(ptr ptr long long long) wine_cudaMemcpyFromSymbol
@ stub cudaMemcpyToArray
@ stdcall cudaMemcpyToSymbol(ptr ptr long long long) wine_cudaMemcpyToSymbol
@ stub cudaMemset
@ stub cudaMemset2D
@ stub cudaMemset3D
@ stub cudaRegisterFatBinary
@ stdcall cudaSetDevice(long) wine_cudaSetDevice
@ stdcall cudaSetupArgument(ptr long long) wine_cudaSetupArgument
@ stub cudaStreamCreate
@ stub cudaStreamDestroy
@ stdcall cudaStreamQuery(ptr) wine_cudaStreamQuery
@ stub cudaStreamSynchronize
@ stub cudaThreadExit
@ stdcall cudaThreadSynchronize() wine_cudaThreadSynchronize
@ stub cudaUnbindTexture
Makefile:
- Code: Select all
### Generated by Winemaker
# Builds cudart.dll.so, the wrapper DLL whose source is cudart.c.
SRCDIR = .
SUBDIRS =
DLLS = cudart.dll
EXES =
### Common settings
CEXTRA =
CXXEXTRA =
RCEXTRA =
# Header and library locations of the CUDA toolkit; adjust to the local installation.
INCLUDE_PATH = -I/usr/local/cuda/include
DLL_PATH =
LIBRARY_PATH = -L/usr/local/cuda/lib
# The wrapper links against the CUDA runtime library it forwards calls to.
LIBRARIES = -lcudart
# USE_SLEEPWAIT selects the nanosleep() polling wait in cudart.c; the value is the sleep
# interval in nanoseconds. Leave it undefined to use cudaEventSynchronize instead.
DEFINES = -DUSE_SLEEPWAIT=300000 # 300 usecs seems to give reasonable results
### cudart.dll sources and settings
cudart_dll_MODULE = cudart.dll
cudart_dll_C_SRCS = cudart.c
cudart_dll_CXX_SRCS =
cudart_dll_RC_SRCS =
cudart_dll_LDFLAGS = -shared \
$(cudart_dll_MODULE:%=%.spec)
cudart_dll_DLL_PATH =
cudart_dll_DLLS = odbc32 \
ole32 \
oleaut32 \
winspool
cudart_dll_LIBRARY_PATH=
cudart_dll_LIBRARIES = uuid
cudart_dll_OBJS = $(cudart_dll_C_SRCS:.c=.o) \
$(cudart_dll_CXX_SRCS:.cpp=.o) \
$(cudart_dll_RC_SRCS:.rc=.res)
### Global source lists
C_SRCS = $(cudart_dll_C_SRCS)
CXX_SRCS = $(cudart_dll_CXX_SRCS)
RC_SRCS = $(cudart_dll_RC_SRCS)
### Tools
# The Wine compiler wrappers are invoked with -m32, i.e. a 32-bit build.
CC = winegcc -m32
CXX = wineg++ -m32
RC = wrc
### Generic targets
all: $(SUBDIRS) $(DLLS:%=%.so) $(EXES:%=%.so)
### Build rules
# NOTE(review): the command lines under each rule below carry no leading tab in this listing;
# make requires one, so the indentation was presumably lost when the file was pasted - restore
# the tabs before use.
.PHONY: all clean dummy
$(SUBDIRS): dummy
@cd $@ && $(MAKE)
# Implicit rules
.SUFFIXES: .cpp .rc .res
DEFINCL = $(INCLUDE_PATH) $(DEFINES) $(OPTIONS)
.c.o:
$(CC) -c $(CFLAGS) $(CEXTRA) $(DEFINCL) -o $@ $<
.cpp.o:
$(CXX) -c $(CXXFLAGS) $(CXXEXTRA) $(DEFINCL) -o $@ $<
.cxx.o:
$(CXX) -c $(CXXFLAGS) $(CXXEXTRA) $(DEFINCL) -o $@ $<
.rc.res:
$(RC) $(RCFLAGS) $(RCEXTRA) $(DEFINCL) -fo$@ $<
# Rules for cleaning
CLEAN_FILES = y.tab.c y.tab.h lex.yy.c core *.orig *.rej \
\\\#*\\\# *~ *% .\\\#*
clean:: $(SUBDIRS:%=%/__clean__) $(EXTRASUBDIRS:%=%/__clean__)
$(RM) $(CLEAN_FILES) $(RC_SRCS:.rc=.res) $(C_SRCS:.c=.o) $(CXX_SRCS:.cpp=.o)
$(RM) $(DLLS:%=%.so) $(EXES:%=%.so) $(EXES:%.exe=%)
$(SUBDIRS:%=%/__clean__): dummy
cd `dirname $@` && $(MAKE) clean
$(EXTRASUBDIRS:%=%/__clean__): dummy
-cd `dirname $@` && $(RM) $(CLEAN_FILES)
### Target specific build rules
DEFLIB = $(LIBRARY_PATH) $(LIBRARIES) $(DLL_PATH)
# Links the wrapper objects with the spec file (via cudart_dll_LDFLAGS) and the CUDA runtime.
$(cudart_dll_MODULE).so: $(cudart_dll_OBJS)
$(CC) $(cudart_dll_LDFLAGS) -o $@ $(cudart_dll_OBJS) $(cudart_dll_LIBRARY_PATH) $(DEFLIB) $(cudart_dll_DLLS:%=-l%) $(cudart_dll_LIBRARIES:%=-l%)