GPU2 in Linux via WINE


Re: GPU2 in Linux via WINE

Postby radaB » Tue Aug 05, 2008 3:14 pm

I had some issues and it took me some time, but based on all these great posts I figured out how to make this work in Gentoo. Here's how I did it:

HowTo for Gentoo Linux x86-64 users (maybe also useful for others):

1. Compile and install CUDA and the thunked cudart.dll

1.1 Download the 32-bit CUDA toolkit from http://www.nvidia.com/object/thankyou_linux.html?url=/compute/cuda/2.0-Beta2/linux/toolkit/NVIDIA_CUDA_Toolkit_2.0beta2_Ubuntu7.10_x86.run. Install it by making it executable, then running it:
Code: Select all
chmod +x NVIDIA_CUDA_Toolkit_2.0beta2_Ubuntu7.10_x86.run
./NVIDIA_CUDA_Toolkit_2.0beta2_Ubuntu7.10_x86.run


1.2 Make a new directory and put the following source files in it:
cudart.c
Code: Select all
#include <windows.h>
#include "cudart.h"
#include "wine/debug.h"

#if 0
#define __CUDA_INTERNAL_COMPILATION__
#include "crt/host_runtime.h"
#endif

#ifdef USE_SLEEPWAIT

#include <time.h>

static struct timespec sleepTime = { 0, USE_SLEEPWAIT };

#endif  /* USE_SLEEPWAIT */

WINE_DEFAULT_DEBUG_CHANNEL(cuda);
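
/* Launch throttling: wine_cudaLaunch records a CUDA event once
 * HALF_QUEUE_MAX kernel launches have been queued, and after QUEUE_MAX
 * launches it waits for that event (via cudaEventSynchronize, or by
 * polling cudaEventQuery with a nanosleep in between when USE_SLEEPWAIT
 * is defined) before queueing more.  This bounds the launch queue and
 * keeps host-side CPU usage down. */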

#define QUEUE_MAX       20
#define HALF_QUEUE_MAX  ((QUEUE_MAX)/2)

static unsigned int numQueued = 0;
static BOOL eventInitialized = FALSE;
static cudaEvent_t event;

static const char* cudaErrorString[] = {
    "cudaSuccess",
    "cudaErrorMissingConfiguration",
    "cudaErrorMemoryAllocation",
    "cudaErrorInitializationError",
    "cudaErrorLaunchFailure",
    "cudaErrorPriorLaunchFailure",
    "cudaErrorLaunchTimeout",
    "cudaErrorLaunchOutOfResources",
    "cudaErrorInvalidDeviceFunction",
    "cudaErrorInvalidConfiguration",
    "cudaErrorInvalidDevice",
    "cudaErrorInvalidValue",
    "cudaErrorInvalidPitchValue",
    "cudaErrorInvalidSymbol",
    "cudaErrorMapBufferObjectFailed",
    "cudaErrorUnmapBufferObjectFailed",
    "cudaErrorInvalidHostPointer",
    "cudaErrorInvalidDevicePointer",
    "cudaErrorInvalidTexture",
    "cudaErrorInvalidTextureBinding",
    "cudaErrorInvalidChannelDescriptor",
    "cudaErrorInvalidMemcpyDirection",
    "cudaErrorAddressOfConstant",
    "cudaErrorTextureFetchFailed",
    "cudaErrorTextureNotBound",
    "cudaErrorSynchronizationError",
    "cudaErrorInvalidFilterSetting",
    "cudaErrorInvalidNormSetting",
    "cudaErrorMixedDeviceExecution",
    "cudaErrorCudartUnloading",
    "cudaErrorUnknown",
    "cudaErrorNotYetImplemented",
    "cudaErrorMemoryValueTooLarge",
    "cudaErrorInvalidResourceHandle",
    "cudaErrorNotReady"
};

static const char* debug_cudaError(cudaError_t err) {
    if (cudaErrorStartupFailure == err) {
        return "cudaErrorStartupFailure";
    }

    if (cudaErrorApiFailureBase == err) {
        return "cudaErrorApiFailureBase";
    }

    if (err >= 0 && err < sizeof(cudaErrorString)/sizeof(cudaErrorString[0])) {
        return cudaErrorString[err];
    }

    WINE_TRACE("unknown error %d\n", err);
    return "unknown CUDA error";
}

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpv)
{
    if (DLL_PROCESS_DETACH == fdwReason)
    {
        /* Cleanup */
        if (eventInitialized) {
            WINE_TRACE("releasing event %d\n", event);

            cudaError_t err = cudaEventDestroy(event);

            if (err) {
                WINE_TRACE("cudaEventDestroy: %s\n", debug_cudaError(err));
            }
        }
    }

    return TRUE;
}

cudaError_t WINAPI wine_cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem,
                                          cudaStream_t stream) {
    WINE_TRACE("(%d %d %d) (%d %d %d) %d %d\n", gridDim.x, gridDim.y, gridDim.z, blockDim.x,
               blockDim.y, blockDim.z, sharedMem, stream);

    cudaError_t err = cudaConfigureCall(gridDim, blockDim, sharedMem, stream);

    if (err) {
        WINE_TRACE("return %s\n", debug_cudaError(err));
    }

    return err;
}

cudaError_t WINAPI wine_cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) {
    WINE_TRACE("\n");
    return cudaGetDeviceProperties(prop, device);
}

const char* WINAPI wine_cudaGetErrorString(cudaError_t error) {
    WINE_TRACE("\n");
    return cudaGetErrorString(error);
}

cudaError_t WINAPI wine_cudaGetLastError() {
    WINE_TRACE("\n");

    cudaError_t err = cudaGetLastError();

    WINE_TRACE("return %s\n", debug_cudaError(err));

    return err;
}

cudaError_t WINAPI wine_cudaLaunch(const char *symbol) {
    WINE_TRACE("%p\n", symbol);

    if (QUEUE_MAX == numQueued) {
        cudaError_t evtErr;

        if (WINE_TRACE_ON(cuda)) {
            /* print out if event was recorded or not */
            WINE_TRACE("check event recorded %s\n", debug_cudaError(cudaEventQuery(event)));
        }

        /* wait for event */
#ifdef USE_SLEEPWAIT
        unsigned int sleepCount = 0;

        while (cudaEventQuery(event) != cudaSuccess) {
            nanosleep(&sleepTime, NULL);
            sleepCount++;
        }

        WINE_TRACE("slept %u times\n", sleepCount);
#else
        evtErr = cudaEventSynchronize(event);

        if (evtErr) {
            WINE_ERR("cudaEventSynchronize: %s\n", debug_cudaError(evtErr));
        }
#endif

        WINE_TRACE("event recorded, continuing\n");

        /* record a new event and subtract HALF_QUEUE_MAX from numQueued */
        numQueued = HALF_QUEUE_MAX;
        evtErr = cudaEventRecord(event, 0);

        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    cudaError_t err = cudaLaunch(symbol);

    if (!eventInitialized) {
        /* Create an event on the first cudaLaunch call.  This is done here so the calling program
         * has a chance to select the GPU device with cudaSetDevice if desired. */
        cudaError_t evtErr = cudaEventCreate(&event);

        if (evtErr) {
            WINE_ERR("cudaEventCreate: %s\n", debug_cudaError(evtErr));
        }

        /* cudaEventCreate can return errors from previous asynchronous calls, so an error here does
         * not necessarily mean the event wasn't created.  Assume it was created for now. */
        eventInitialized = TRUE;
        WINE_TRACE("created event %d\n", event);
    }

    /* record an event at HALF_QUEUE_MAX */
    if (HALF_QUEUE_MAX == ++numQueued) {
        cudaError_t evtErr = cudaEventRecord(event, 0);  /* Assuming everything using stream 0 */

        if (evtErr) {
            WINE_ERR("cudaEventRecord: %s\n", debug_cudaError(evtErr));
        }
    }

    if (err) {
        WINE_TRACE("return %s\n", debug_cudaError(err));
    }

    return err;
}

cudaError_t WINAPI wine_cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
    WINE_TRACE("%p, %p, %d, %d\n", dst, src, count, kind);

    cudaError_t err = cudaMemcpy(dst, src, count, kind);

    if (err) {
        WINE_TRACE("return %s\n", debug_cudaError(err));
    }

    return err;
}

cudaError_t WINAPI wine_cudaMemcpyFromSymbol(void *dst, const char *symbol, size_t count, size_t offset,
                                             enum cudaMemcpyKind kind) {
    WINE_TRACE("\n");
    return cudaMemcpyFromSymbol(dst, symbol, count, offset, kind);
}

cudaError_t WINAPI wine_cudaMemcpyToSymbol(const char *symbol, const void *src, size_t count, size_t offset,
                                           enum cudaMemcpyKind kind) {
    WINE_TRACE("\n");
    return cudaMemcpyToSymbol(symbol, src, count, offset, kind);
}

void** WINAPI wine_cudaRegisterFatBinary(void *fatCubin) {
    WINE_TRACE("\n");
    return __cudaRegisterFatBinary(fatCubin);
}

void WINAPI wine_cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun,
                                      const char *deviceName, int thread_limit, uint3 *tid,
                                      uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize) {
    WINE_TRACE("\n");
    __cudaRegisterFunction(fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, bid,
                           bDim, gDim, wSize);
}

void WINAPI wine_cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
                                 const char  *deviceName, int ext, int size, int constant,
                                 int global) {
    WINE_TRACE("\n");
    __cudaRegisterVar(fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global);
}

void WINAPI wine_cudaRegisterShared(void **fatCubinHandle, void **devicePtr) {
    WINE_TRACE("\n");
    __cudaRegisterShared(fatCubinHandle, devicePtr);
}

void WINAPI wine_cudaRegisterSharedVar(void **fatCubinHandle, void **devicePtr, size_t size,
                                    size_t alignment, int storage) {
    WINE_TRACE("\n");
    __cudaRegisterSharedVar(fatCubinHandle, devicePtr, size, alignment, storage);
}

cudaError_t WINAPI wine_cudaSetDevice(int device) {
    WINE_TRACE("\n");
    return cudaSetDevice(device);
}

void WINAPI wine_cudaUnregisterFatBinary(void **fatCubinHandle) {
    WINE_TRACE("\n");
    __cudaUnregisterFatBinary(fatCubinHandle);
}

cudaError_t WINAPI wine_cudaFree(void *devPtr) {
    WINE_TRACE("\n");
    return cudaFree(devPtr);
}

cudaError_t WINAPI wine_cudaMalloc(void **devPtr, size_t size) {
    WINE_TRACE("\n");
    return cudaMalloc(devPtr, size);
}


cudart.dll.spec
Code: Select all
    @ stdcall __cudaRegisterFatBinary(ptr) wine_cudaRegisterFatBinary
    @ stdcall __cudaRegisterFunction(ptr ptr ptr ptr long ptr ptr ptr ptr ptr) wine_cudaRegisterFunction
    @ stdcall __cudaRegisterVar(ptr ptr ptr ptr long long long long) wine_cudaRegisterVar
    @ stdcall __cudaRegisterShared(ptr ptr) wine_cudaRegisterShared
    @ stdcall __cudaRegisterSharedVar(ptr ptr long long long) wine_cudaRegisterSharedVar
    @ stdcall __cudaUnregisterFatBinary(ptr) wine_cudaUnregisterFatBinary
    @ stub cudaBindTexture
    @ stub cudaBindTextureToArray
    @ stub cudaChooseDevice
    @ stdcall cudaConfigureCall(ptr ptr long long) wine_cudaConfigureCall
    @ stub cudaCreateChannelDesc
    @ stub cudaD3D9Begin
    @ stub cudaD3D9End
    @ stub cudaD3D9GetDevice
    @ stub cudaD3D9GetDirect3DDevice
    @ stub cudaD3D9MapResources
    @ stub cudaD3D9MapVertexBuffer
    @ stub cudaD3D9RegisterResource
    @ stub cudaD3D9RegisterVertexBuffer
    @ stub cudaD3D9ResourceGetMappedPitch
    @ stub cudaD3D9ResourceGetMappedPointer
    @ stub cudaD3D9ResourceGetMappedSize
    @ stub cudaD3D9ResourceGetSurfaceDimensions
    @ stub cudaD3D9ResourceSetMapFlags
    @ stub cudaD3D9SetDirect3DDevice
    @ stub cudaD3D9UnmapResources
    @ stub cudaD3D9UnmapVertexBuffer
    @ stub cudaD3D9UnregisterResource
    @ stub cudaD3D9UnregisterVertexBuffer
    @ stub cudaEventCreate
    @ stub cudaEventDestroy
    @ stub cudaEventElapsedTime
    @ stub cudaEventQuery
    @ stub cudaEventRecord
    @ stub cudaEventSynchronize
    @ stdcall cudaFree(ptr) wine_cudaFree
    @ stub cudaFreeArray
    @ stub cudaFreeHost
    @ stub cudaGetChannelDesc
    @ stub cudaGetDevice
    @ stub cudaGetDeviceCount
    @ stdcall cudaGetDeviceProperties(ptr long) wine_cudaGetDeviceProperties
    @ stdcall cudaGetErrorString(long) wine_cudaGetErrorString
    @ stdcall cudaGetLastError() wine_cudaGetLastError
    @ stub cudaGetSymbolAddress
    @ stub cudaGetSymbolSize
    @ stub cudaGetTextureAlignmentOffset
    @ stub cudaGetTextureReference
    @ stub cudaGLMapBufferObject
    @ stub cudaGLRegisterBufferObject
    @ stub cudaGLSetGLDevice
    @ stub cudaGLUnmapBufferObject
    @ stub cudaGLUnregisterBufferObject
    @ stdcall cudaLaunch(ptr) wine_cudaLaunch
    @ stdcall cudaMalloc(ptr long) wine_cudaMalloc
    @ stub cudaMalloc3D
    @ stub cudaMalloc3DArray
    @ stub cudaMallocArray
    @ stub cudaMallocHost
    @ stub cudaMallocPitch
    @ stdcall cudaMemcpy(ptr ptr long long) wine_cudaMemcpy
    @ stub cudaMemcpy2D
    @ stub cudaMemcpy2DArrayToArray
    @ stub cudaMemcpy2DFromArray
    @ stub cudaMemcpy2DToArray
    @ stub cudaMemcpy3D
    @ stub cudaMemcpyArrayToArray
    @ stub cudaMemcpyFromArray
    @ stdcall cudaMemcpyFromSymbol(ptr ptr long long long) wine_cudaMemcpyFromSymbol
    @ stub cudaMemcpyToArray
    @ stdcall cudaMemcpyToSymbol(ptr ptr long long long) wine_cudaMemcpyToSymbol
    @ stub cudaMemset
    @ stub cudaMemset2D
    @ stub cudaMemset3D
    @ stub cudaRegisterFatBinary
    @ stdcall cudaSetDevice(long) wine_cudaSetDevice
    @ stub cudaSetupArgument
    @ stub cudaStreamCreate
    @ stub cudaStreamDestroy
    @ stub cudaStreamQuery
    @ stub cudaStreamSynchronize
    @ stub cudaThreadExit
    @ stub cudaThreadSynchronize
    @ stub cudaUnbindTexture


cudart.h
Code: Select all
    #include "cuda_runtime_api.h"

    void** __cudaRegisterFatBinary(void *fatCubin);
    void __cudaUnregisterFatBinary(void **fatCubinHandle);

    void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun,
                                const char *deviceName, int thread_limit, uint3 *tid,
                                uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize);

    void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
                           const char  *deviceName, int ext, int size, int constant,
                           int global);

    void __cudaRegisterShared(void **fatCubinHandle, void **devicePtr);

    void __cudaRegisterSharedVar(void **fatCubinHandle, void **devicePtr, size_t size,
                                 size_t alignment, int storage);


Makefile
Code: Select all
### Generated by Winemaker


SRCDIR                = .
SUBDIRS               =
DLLS                  = cudart.dll
EXES                  =



### Common settings

CEXTRA                =
CXXEXTRA              =
RCEXTRA               =
INCLUDE_PATH          = -I/usr/local/cuda/include
DLL_PATH              =
LIBRARY_PATH          = -L/usr/local/cuda/lib
LIBRARIES             = -lcudart
DEFINES               = -DUSE_SLEEPWAIT=300000    # 300 usecs seems to give reasonable results


### cudart.dll sources and settings

cudart_dll_MODULE     = cudart.dll
cudart_dll_C_SRCS     = cudart.c
cudart_dll_CXX_SRCS   =
cudart_dll_RC_SRCS    =
cudart_dll_LDFLAGS    = -shared \
         $(cudart_dll_MODULE:%=%.spec)
cudart_dll_DLL_PATH   =
cudart_dll_DLLS       = odbc32 \
         ole32 \
         oleaut32 \
         winspool
cudart_dll_LIBRARY_PATH=
cudart_dll_LIBRARIES  = uuid

cudart_dll_OBJS       = $(cudart_dll_C_SRCS:.c=.o) \
         $(cudart_dll_CXX_SRCS:.cpp=.o) \
         $(cudart_dll_RC_SRCS:.rc=.res)



### Global source lists

C_SRCS                = $(cudart_dll_C_SRCS)
CXX_SRCS              = $(cudart_dll_CXX_SRCS)
RC_SRCS               = $(cudart_dll_RC_SRCS)


### Tools

CC = winegcc -m32
CXX = wineg++ -m32
RC = wrc


### Generic targets

all: $(SUBDIRS) $(DLLS:%=%.so) $(EXES:%=%.so)

### Build rules

.PHONY: all clean dummy

$(SUBDIRS): dummy
   @cd $@ && $(MAKE)

# Implicit rules

.SUFFIXES: .cpp .rc .res
DEFINCL = $(INCLUDE_PATH) $(DEFINES) $(OPTIONS)

.c.o:
   $(CC) -c $(CFLAGS) $(CEXTRA) $(DEFINCL) -o $@ $<

.cpp.o:
   $(CXX) -c $(CXXFLAGS) $(CXXEXTRA) $(DEFINCL) -o $@ $<

.cxx.o:
   $(CXX) -c $(CXXFLAGS) $(CXXEXTRA) $(DEFINCL) -o $@ $<

.rc.res:
   $(RC) $(RCFLAGS) $(RCEXTRA) $(DEFINCL) -fo$@ $<

# Rules for cleaning

CLEAN_FILES     = y.tab.c y.tab.h lex.yy.c core *.orig *.rej \
                  \\\#*\\\# *~ *% .\\\#*

clean:: $(SUBDIRS:%=%/__clean__) $(EXTRASUBDIRS:%=%/__clean__)
   $(RM) $(CLEAN_FILES) $(RC_SRCS:.rc=.res) $(C_SRCS:.c=.o) $(CXX_SRCS:.cpp=.o)
   $(RM) $(DLLS:%=%.so) $(EXES:%=%.so) $(EXES:%.exe=%)

$(SUBDIRS:%=%/__clean__): dummy
   cd `dirname $@` && $(MAKE) clean

$(EXTRASUBDIRS:%=%/__clean__): dummy
   -cd `dirname $@` && $(RM) $(CLEAN_FILES)

### Target specific build rules
DEFLIB = $(LIBRARY_PATH) $(LIBRARIES) $(DLL_PATH)

$(cudart_dll_MODULE).so: $(cudart_dll_OBJS)
   $(CC) $(cudart_dll_LDFLAGS) -o $@ $(cudart_dll_OBJS) $(cudart_dll_LIBRARY_PATH) $(DEFLIB) $(cudart_dll_DLLS:%=-l%) $(cudart_dll_LIBRARIES:%=-l%)


1.3 Go into the directory and run make:
Code: Select all
make

You will likely get errors like:
Makefile:68: *** missing separator.  Stop.

These are caused by the forum's code tags turning tabs into spaces: make requires each recipe line (the indented command lines) to start with a tab character. On every line the error points at, replace the leading spaces with a single tab.
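
A hypothetical GNU sed one-liner that converts any run of leading spaces to a single tab throughout the Makefile (fine for this particular Makefile, but check the result if you have edited it):
Code: Select all
sed -i 's/^ \+/\t/' Makefile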

1.4 Rename the newly compiled cudart.dll.so to nvcuda.dll and copy it into 'drive_c/windows/system32' inside your WINE prefix (~/.wine by default):
Code: Select all
cp -a cudart.dll.so ~/.wine/drive_c/windows/system32/nvcuda.dll


1.5 Copy the CUDA libraries to the lib32 folder:
Code: Select all
cp -a /usr/local/cuda/lib/* /usr/lib32


2. Download and install the F@H GPU Client

2.1 Go to http://www.stanford.edu/group/pandegroup/folding/release/Folding@home-Win32-GPU_XP-620.zip and get the XP console client. Unzip it to a folder.

2.2 Change into the directory you extracted the zip to and run:
Code: Select all
nice -n18 wine folding@home-Win32-GPU.exe -forcegpu nvidia_g80


The GPU client should now be running. Thanks to actong, andromeda and everyone else who contributed.

EDIT: updated, no need to recompile wine.

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Tue Aug 12, 2008 11:40 am

Guys, I need a hand with the next stage in this. :-)

I just got me a 9800GX2, and I'd like to fold under Linux on both GPUs in it. :-)

Has anyone got any advice on this matter WRT xorg.conf and WINE config?

Would it work to just add Adapters[1] (and Adapters[2], since I still have my 8800GT) in the WINE config, and run the GPU client with -gpu 1, -gpu 2, etc.? How does this hook into X displays? I only have one monitor, so ideally I'd want the other 2 displays to come up at a minimum resolution like 640x480 (to conserve memory) and just be used for folding.

If this is not likely to work, how do I get the 9800GX2 to work as 2 separate cards, and again, the secondary to be low-res? Or I could fire up separate X servers, so the secondary ones just run FAH via something like the xinit wrapper suggested earlier to reduce X lag?

Has anyone got any docs or thoughts on this? I can set up separate X devices for normal cards, but in lspci, the GX2 shows up as 1 device (one PCI ID), so I'm not sure how that would get referenced in a separate xorg.conf file.
1x Q6600 @ 3.2GHz, 4GB DDR3-1333
1x Phenom X4 9950 @ 2.6GHz, 4GB DDR2-1066
3x GeForce 9800GX2
1x GeForce 8800GT
CentOS 5 x86-64, WINE 1.x with CUDA wrappers

Re: GPU2 in Linux via WINE

Postby uncle_fungus » Tue Aug 12, 2008 11:45 am

Assuming that the Linux version of CUDA can access multiple CUDA devices even when only one monitor is attached, you may be able to make use of the -forcegpu flag.

Just starting the client with -gpu 1 most likely won't work, because it won't be able to see a device; however, if you also add -forcegpu nvidia_g80 it will override detection.

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Tue Aug 12, 2008 11:54 am

Yes, but the patch for WINE only sets up Adapters[0] as an nVidia one. I'm not that familiar with WINE internals, but I'm assuming that this means the other devices aren't going to be available.

I'm also pretty sure X has to have a separate device defined for each Cuda device, and it is this definition that I'm looking for. Here is what lspci reports for my nVidia devices:

03:00.0 3D controller: nVidia Corporation Unknown device 0604 (rev a2)
04:00.0 VGA compatible controller: nVidia Corporation Unknown device 0604 (rev a2)
05:00.0 VGA compatible controller: nVidia Corporation GeForce 8800 GT (rev a2)

I can set BusID to "PCI:4:0:0" and that works fine, but with BusID "PCI:3:0:0" nothing comes up on the screen, on either port.
Is there a way to have each of the 2 GPUs drive 1 X instance? If so, then I can work around it with separate X servers and separate WINE instances. Not ideal (single X + single WINE instance would be more memory efficient), but I'd accept that as a solution for now.

[edit: I pulled the 8800GT because the total heat produced was getting silly.]

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Tue Aug 12, 2008 7:13 pm

OK, so I came up with a bit of a bodge that seems to _almost_ work.

The basic plan is to run a different Xorg for each F@H client, with each xorg.conf referencing a different GPU.

For the sake of argument, I keep F@H installs under /opt/local/fah/gpu, and all of this is based on RHEL/CentOS 5.

In /etc/X11/prefdm, add:

Code: Select all
su -m fah -c "/opt/local/fah/gpu/2/bin/x.sh 2>&1 > /dev/null &" &
sleep 20


This assumes you have a user called fah that does all the folding and owns /opt/local/fah/*. The sleep is important (you may even want to increase it a bit) because it gives the first X server time to start up and load F@H under WINE. Switching at a bad time can stall or crash the client.

In this case the different GPU clients live under /opt/local/fah/gpu/X/, where X is the number of the instance (it can be picked arbitrarily and doesn't actually have to be a number).

/opt/local/fah/gpu/2/bin/x.sh:
Code: Select all
#!/bin/bash

taskset 02 xinit /bin/bash /opt/local/fah/gpu/2/bin/gpu.sh -- /usr/bin/Xorg :1 -config ./xorg.conf vt11 2>&1 > /dev/null


taskset 02 sets the affinity mask so the process only runs on CPU/core 2 (01 for core 1, 04 for core 3, 08 for core 4, etc.; it's a binary bitmask).

/opt/local/fah/gpu/2/bin/gpu.sh
Code: Select all
#!/bin/bash

pushd /opt/local/fah/gpu/2/.wine/drive_c/Program\ Files/Folding\@home/Folding\@home-gpu/
export WINEPREFIX=/opt/local/fah/gpu/2/.wine
taskset 02 nice -n 10 wine Folding\@home.exe 2>&1 > /dev/null
popd


This assumes that the WINE prefix for this client lives in /opt/local/fah/gpu/2/.wine. A separate prefix should be used for each client.

The last important bit is xorg.conf. Put one in /opt/local/fah/gpu/$X/bin for each instance, and make sure you specify a different BusID for each GPU. Here's a fragment:

Code: Select all
Section "Device"
    Identifier     "Videocard0"
    Driver         "nvidia"
    VendorName     "nVidia"
    BoardName      "GeForce 9800GX2"
    BusID          "PCI:3:0:0"
EndSection


In my case, the two separate BusID entries are PCI:3:0:0 and PCI:4:0:0 for the two GPUs. Have one in each xorg.conf.

Note - the prefdm entry at the top assumes you run the default run level 5 (X), so you only need to start up the F@H instances that are not on the primary X display you'll be using (start that one yourself once you're logged in).

Now for the side-effects:

When one client is running, it uses (with the nanosleep() patch) 16% CPU and crunches 1% per 90 seconds or so. However, when TWO instances are running, they each run at 8% CPU even though they run on different CPUs (that's what taskset above is for), and it takes 4min:47sec for 1% of a WU to be completed. This seems a little bizarre. Possible causes that spring to mind:

1) Despite each instance being bound to a separate GPU via separate Xorg instances, this isn't working and they are both binding to the same GPU after all, and the massive performance degradation comes from inefficient GPU task switching.

2) The nanosleep() patch somehow has both instances linked (shared memory for the DLL or something?), and they bottleneck there. This might be supported by the observation that each instance uses up 8% of CPU while a single instance uses 16%. I will try reducing the sleep time to half, and see if that helps.

Any thoughts or comments welcome.

[edit]
With the sleep time changed to 150us, 1 client uses 18% CPU and 2 use 11% CPU each. The 1% time is still identical at 4:47, which implies they are both, after all, running on the same GPU. Can anyone guess why they might both be running on the same GPU when each xorg.conf has a different BusID in its device section? I'm pretty sure WINE should be binding to the X server it is running under. Could this be a Cuda issue somewhere in the X configuration?

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Wed Aug 13, 2008 12:29 am

Made a little more progress on this. It turns out that an alternate Xorg config file with the same name but in a different directory gets ignored, because the system config in /etc/X11/xorg.conf is found earlier in the search path. So instead of calling the alternate config xorg.conf, call it something else, e.g. xorg.conf.1.

This now allows me to have one X server hanging off each GPU, at least as far as I can see from the logs, and the dummy one gets redirected to the 2nd output on the GX2 as well. If the screen type (nothing is actually connected) is forced to CRT, it defaults to 640x480, which should nicely minimize the memory wasted by the 2nd X server. Here is the device section for the secondary GPU from xorg.conf.1:

Code: Select all
Section "Device"
    Identifier     "Videocard1"
    Driver         "nvidia"
    VendorName     "nVidia"
    BoardName      "GeForce 9800GX2"
    BusID          "PCI:3:0:0"
    Option         "MultiGPU" "0"
    Option         "SLI" "0"
    Option         "AllowDDCCI" "0"
    Option         "IgnoreEDID" "1"
    Option         "ConnectedMonitor" "CRT"
EndSection
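
Since the renamed config no longer matches Xorg's default search, the xinit line in x.sh needs the matching -config argument, e.g.:
Code: Select all
taskset 02 xinit /bin/bash /opt/local/fah/gpu/2/bin/gpu.sh -- /usr/bin/Xorg :1 -config xorg.conf.1 vt11 2>&1 > /dev/null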


There is still a problem, however: when 2 clients are running, performance again degrades to about 4:41 per percent, which implies that although the WINE instances are running in separate X instances bound to separate GPUs, they are STILL both running on the same GPU and degrading each other. My best guess at the moment is that even though there are two separate WINE run-time instances, the fact that they are both spawned from the same wine binary might be causing some confusion over shared libraries that have already locked on to the first X instance.

I'm not sure how to proceed from here; any input from anyone more familiar with WINE and/or Cuda internals would be most welcome (actong, Shelnutt2 and andromeda, I'm looking at you ;-) ).

[edit]
Another bizarre observation: I changed the user the secondary F@H instance runs as, and when I do that the performance drops through the floor, from 90s/% to 130s/%, with no other changes. The instance running as the fah user on the 2nd X server on the 2nd GPU ends up using about 11% CPU (down from 16%) when I run an X instance (without F@H) as myself. But if the dummy instance also runs as me, this degradation doesn't occur. If I then fire up another instance as myself, the fah user's instance goes down to 5% and my own goes to 11% (each bound to a different CPU and to a different X instance bound to a different GPU). At that point, between them, the two F@H instances just about manage to get through 1% of a WU in 120s, but they run quite unbalanced: one at about 100s/%, the other at over 150s/%. Changing the secondary instance to run as the same user as the other X instances brings its stand-alone performance back to the correct level (90s/%). This seems really odd: there is clearly a context switch happening somewhere between the different users which utterly hammers the performance, but I'm not sure where to look for it or how to work around it.

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Wed Aug 13, 2008 2:57 pm

A bit more progress:

By reducing the sleep time to 100us and upping the queue size to 100, I managed to get the 1% time down to 95 seconds when running the secondary instance as another user (normally 87-90 on the primary display). But when the primary instance gets started, the secondary goes down to 200s/% and the primary does it in 164s/%. That averages out to 182s/2% = 91s/%, which puts it at the same point as a single GPU working on its own. I don't think this would be achievable if both clients were running on the same GPU, due to the overheads involved (and if they did manage to hook a GPU other than the one their X session is running on, I'd love to see how that is possible). So both GPUs _ARE_ being used. The bottleneck therefore appears to have something to do with the throttling in the cudart.dll wrapper. I'm guessing that both wrappers hook the same core library, which is where the cross-over happens.
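
(For reference, the two knobs being tuned here are QUEUE_MAX at the top of cudart.c and the -DUSE_SLEEPWAIT value on the DEFINES line of the Makefile; the values below only mirror this particular test.)
Code: Select all
#define QUEUE_MAX       100   /* was 20; HALF_QUEUE_MAX is derived from it */
/* and in the Makefile: DEFINES = -DUSE_SLEEPWAIT=100000   # 100 usecs */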

I wonder if reducing the sleep time and upping the queue size further will help, or whether a more radical change (possibly removing the sleep altogether) is required.

[edit]
Even dropping the sleep time to 50us and upping the queue size to 256 doesn't seem to get past the wall of 180s/% on each instance (though both instances are now balanced, and each uses 50-55% of the CPU it's bound to!). This implies that both instances are running on the same GPU after all. The temperature on one GPU is 84C compared to 78C on the other, which also seems to back up that theory. :-(

Hmm... isn't there a wrapped function in cudart.c that sets the device for Cuda to bind to? Could that be hard-coded to a different device for each instance? Thinking about it, that might also remove the need to run a separate X server.

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Wed Aug 13, 2008 4:18 pm

CRACKED IT! :-D

It's a bodge, and one that could probably be done more cleanly, but as a proof of concept this works! :-)
Secondary client: 128s/% @ 11%CPU
Primary client: 88s/% @ 16% CPU

I know that the performance of the 2nd client can be fixed by reducing the sleep time and/or upping the queue size, so both will hopefully soon be up to ~90s/%. :-)

Now - what I did!
In cudart.c there is a wrapper function, wine_cudaSetDevice, that sets the Cuda device to be used by the application. This is presumably what the GPU client sets according to the -gpu X parameter, where X is a number between 0 and n and (n+1) is the number of available Cuda devices.

So I left the primary cudart.c standard (i.e. selectable). But for the 2nd WINE instance (the one that runs on the dummy X server), I hard-coded it to always use device 1 (0 being the default). And it works, with both GPUs being used! :-D

In cudart.c, in the wine_cudaSetDevice wrapper (line 309 in my copy), replace the cudaSetDevice call like this:

Code: Select all
//cudaError_t err = cudaSetDevice(device);
cudaError_t err = cudaSetDevice(1);


make clean; make;

and copy the new cudart.dll.so to cudart.dll in the secondary WINE location.
Note: you might also want to reduce the sleep time a bit and up the queue size to get maximum performance from the secondary instance; otherwise it'll run at about 2/3 speed.
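
For reference, a less hard-coded variant of the same bodge would let one compiled wrapper serve every instance. This is only a sketch, assuming a made-up CUDART_DEVICE environment variable that you export before starting each WINE instance (it also needs #include <stdlib.h> at the top of cudart.c):
Code: Select all
cudaError_t WINAPI wine_cudaSetDevice(int device) {
    /* Hypothetical override: if CUDART_DEVICE is set in this WINE
     * instance's environment, it takes precedence over whatever the
     * client asked for; when unset, behaviour is unchanged. */
    const char *override = getenv("CUDART_DEVICE");

    if (override != NULL) {
        device = atoi(override);
    }

    WINE_TRACE("device %d\n", device);
    return cudaSetDevice(device);
}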

What I'm thinking about now is that if this is set from the -gpu parameter, adding another entry to the Adapters[] array in WINE might make this hard-coded bodge unnecessary. With a bit of luck, it would just pass the parameter straight through to this function, and it would all just work without needing a different cudart.dll wrapper for each instance. I might try that tomorrow.

Re: GPU2 in Linux via WINE

Postby uncle_fungus » Wed Aug 13, 2008 4:22 pm

I would try starting the second client with -gpu 1 -forcegpu nvidia_g80 and see what happens. As far as I can tell, the Adapters[] array in Wine is only used for card detection, since CUDA is handling (or perhaps mishandling) everything else.
-gpu 1 will force CUDA to use device 1 (instead of 0), and -forcegpu nvidia_g80 will bypass the client's hardware detection, thus negating the need to alter Adapters[].
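
Using the same client binary as in the HowTo earlier in the thread, that would be something like:
Code: Select all
nice -n18 wine folding@home-Win32-GPU.exe -gpu 1 -forcegpu nvidia_g80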

If the above method works, it will probably allow you to run everything from one X and Wine instance... possibly.

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Wed Aug 13, 2008 4:37 pm

Thanks for that, I'll try it. :-)
If it works, I'll wish I'd thought of it before embarking on the epic mission of getting multiple X servers going, only to find out that it was irrelevant and that what I really needed to do was hack the cudart wrapper. To find out that even THAT was unnecessary would just be painful. ;-) But I hope it works nonetheless. :-)

[edit]
Many thanks, uncle_fungus! It works just like you said! :-D
And with both clients running on the same X screen there is no performance degradation either. There is less CPU and memory usage, and more GPU usage: both GPUs are now churning out 1% in 87-89s! :-D

I'm feeling really stupid now for trying all the roundabout ways of getting this to work when the optimal solution was so incredibly simple!

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Fri Aug 15, 2008 8:51 am

I'm not sure if this is WINE or Cuda related, or whether the GX2 cards by design cannot handle being run constantly (without being OC-ed), but I'm seeing the following failure after a while on one of the GPU instances:

Code: Select all
[06:23:00] - Preparing to get new work unit...
[06:23:00] + Attempting to get work packet
[06:23:00] - Connecting to assignment server
[06:23:01] - Successful: assigned to (171.64.65.106).
[06:23:01] + News From Folding@Home: GPU folding beta
[06:23:01] Loaded queue successfully.
[06:23:02] + Closed connections
[06:23:02]
[06:23:02] + Processing work unit
[06:23:02] Core required: FahCore_11.exe
[06:23:02] Core found.
[06:23:02] Working on queue slot 06 [August 15 06:23:02 UTC]
[06:23:02] + Working ...
[06:23:03]
[06:23:03] *------------------------------*
[06:23:03] Folding@Home GPU Core - Beta
[06:23:03] Version 1.09 (Fri Aug 1 11:46:54 PDT 2008)
[06:23:03]
[06:23:03] Compiler  : Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 14.00.50727.762 for 80x86
[06:23:03] Build host: amoeba
[06:23:03] Board Type: Nvidia
[06:23:03] Core      :
[06:23:03] Preparing to commence simulation
[06:23:03] - Looking at optimizations...
[06:23:03] - Created dyn
[06:23:03] - Files status OK
[06:23:03] - Expanded 45437 -> 246249 (decompressed 541.9 percent)
[06:23:03] Called DecompressByteArray: compressed_data_size=45437 data_size=246249, decompressed_data_size=246249 diff=0
[06:23:03] - Digital signature verified
[06:23:03]
[06:23:03] Project: 5506 (Run 0, Clone 148, Gen 29)
[06:23:03]
[06:23:03] Assembly optimizations on if available.
[06:23:03] Entering M.D.
[06:23:09] Working on p5506_supervillin_e1
[06:23:10] Client config found, loading data.
[06:23:10] Starting GUI Server
[06:24:38] Completed 1%
[06:26:07] Completed 2%
[06:27:35] Completed 3%
[06:29:03] Completed 4%
[06:30:31] Completed 5%
[06:32:00] Completed 6%
[06:33:28] Completed 7%
[06:34:56] Completed 8%
[06:36:24] Completed 9%
[06:37:52] Completed 10%
[06:39:21] Completed 11%
[06:40:49] Completed 12%
[06:42:17] Completed 13%
[06:43:45] Completed 14%
[06:45:14] Completed 15%
[06:46:42] Completed 16%
[06:48:10] Completed 17%
[06:49:38] Completed 18%
[06:51:07] Completed 19%
[06:52:35] Completed 20%
[06:54:03] Completed 21%
[06:55:31] Completed 22%
[06:57:00] Completed 23%
[06:58:28] Completed 24%
[06:59:56] Completed 25%
[07:01:24] Completed 26%
[07:02:52] Completed 27%
[07:04:21] Completed 28%
[07:05:49] Completed 29%
[07:07:17] Completed 30%
[07:08:45] Completed 31%
[07:10:14] Completed 32%
[07:11:42] Completed 33%
[07:13:10] Completed 34%
[07:14:38] Completed 35%
[07:16:07] Completed 36%
[07:17:35] Completed 37%
[07:19:03] Completed 38%
[07:20:31] Completed 39%
[07:21:59] Completed 40%
[07:23:28] Completed 41%
[07:24:56] Completed 42%
[07:26:24] Completed 43%
[07:27:52] Completed 44%
[07:29:21] Completed 45%
[07:30:49] Completed 46%
[07:32:17] Completed 47%
[07:33:45] Completed 48%
[07:35:14] Completed 49%
[07:36:42] Completed 50%
[07:38:10] Completed 51%
[07:39:38] Completed 52%
[07:41:06] Completed 53%
[07:42:35] Completed 54%
[07:44:03] Completed 55%
[07:45:31] Completed 56%
[07:46:59] Completed 57%
[07:48:28] Completed 58%
[07:49:56] Completed 59%
[07:50:34] Gromacs cannot continue further.
[07:50:34] Going to send back what have done.
[07:50:35] logfile size: 79619 info=79619 bed=0 hdr=23
[07:50:35] - Writing 80155 bytes of core data to disk...
[07:50:35] Done: 79643 -> 9380 (compressed to 11.7 percent)
[07:50:35]   ... Done.
[07:50:35]
[07:50:35] Folding@home Core Shutdown: UNSTABLE_MACHINE
[07:50:39] CoreStatus = 7A (122)
[07:50:39] Sending work to server
[07:50:39] Project: 5506 (Run 0, Clone 148, Gen 29)
[07:50:39] - Read packet limit of 540015616... Set to 524286976.


[07:50:39] + Attempting to send results [August 15 07:50:39 UTC]
[07:50:40] + Results successfully sent
[07:50:40] Thank you for your contribution to Folding@Home.
[07:50:44] - Preparing to get new work unit...
[07:50:44] + Attempting to get work packet
[07:50:44] - Connecting to assignment server
[07:50:45] - Successful: assigned to (171.64.65.106).
[07:50:45] + News From Folding@Home: GPU folding beta
[07:50:45] Loaded queue successfully.
[07:50:47] + Closed connections
[07:50:52]
[07:50:52] + Processing work unit
[07:50:52] Core required: FahCore_11.exe
[07:50:52] Core found.
[07:50:52] Working on queue slot 07 [August 15 07:50:52 UTC]
[07:50:52] + Working ...
[07:50:52]
[07:50:52] *------------------------------*
[07:50:52] Folding@Home GPU Core - Beta
[07:50:52] Version 1.09 (Fri Aug 1 11:46:54 PDT 2008)
[07:50:52]
[07:50:52] Compiler  : Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 14.00.50727.762 for 80x86
[07:50:52] Build host: amoeba
[07:50:52] Board Type: Nvidia
[07:50:52] Core      :
[07:50:52] Preparing to commence simulation
[07:50:52] - Looking at optimizations...
[07:50:52] - Created dyn
[07:50:52] - Files status OK
[07:50:52] - Expanded 45484 -> 246249 (decompressed 541.3 percent)
[07:50:52] Called DecompressByteArray: compressed_data_size=45484 data_size=246249, decompressed_data_size=246249 diff=0
[07:50:52] - Digital signature verified
[07:50:52]
[07:50:52] Project: 5506 (Run 2, Clone 154, Gen 24)
[07:50:52]
[07:50:52] Assembly optimizations on if available.
[07:50:52] Entering M.D.
[07:50:58] Working on p5506_supervillin_e1
[07:50:58] Gromacs cannot continue further.
[07:50:58] Going to send back what have done.
[07:50:59] logfile size: 9573 info=9573 bed=0 hdr=23
[07:50:59] - Writing 10109 bytes of core data to disk...
[07:50:59] Done: 9597 -> 3405 (compressed to 35.4 percent)
[07:50:59]   ... Done.
[07:50:59]
[07:50:59] Folding@home Core Shutdown: UNSTABLE_MACHINE
[07:51:02] CoreStatus = 7A (122)
[07:51:02] Sending work to server
[07:51:02] Project: 5506 (Run 2, Clone 154, Gen 24)
[07:51:02] - Read packet limit of 540015616... Set to 524286976.


[07:51:02] + Attempting to send results [August 15 07:51:02 UTC]
[07:51:03] + Results successfully sent
[07:51:03] Thank you for your contribution to Folding@Home.
[07:51:07] - Preparing to get new work unit...
[07:51:07] + Attempting to get work packet
[07:51:07] - Connecting to assignment server
[07:51:07] - Successful: assigned to (171.64.65.106).
[07:51:07] + News From Folding@Home: GPU folding beta
[07:51:08] Loaded queue successfully.
[07:51:09] + Closed connections
[07:51:14]
[07:51:14] + Processing work unit
[07:51:14] Core required: FahCore_11.exe
[07:51:14] Core found.
[07:51:14] Working on queue slot 08 [August 15 07:51:14 UTC]
[07:51:14] + Working ...
[07:51:14]
[07:51:14] *------------------------------*
[07:51:14] Folding@Home GPU Core - Beta
[07:51:14] Version 1.09 (Fri Aug 1 11:46:54 PDT 2008)
[07:51:14]
[07:51:14] Compiler  : Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 14.00.50727.762 for 80x86
[07:51:14] Build host: amoeba
[07:51:14] Board Type: Nvidia
[07:51:14] Core      :
[07:51:14] Preparing to commence simulation
[07:51:14] - Looking at optimizations...
[07:51:14] - Created dyn
[07:51:14] - Files status OK
[07:51:14] - Expanded 45528 -> 246249 (decompressed 540.8 percent)
[07:51:14] Called DecompressByteArray: compressed_data_size=45528 data_size=246249, decompressed_data_size=246249 diff=0
[07:51:14] - Digital signature verified
[07:51:14]
[07:51:14] Project: 5506 (Run 7, Clone 159, Gen 23)
[07:51:14]
[07:51:14] Assembly optimizations on if available.
[07:51:14] Entering M.D.
[07:51:21] Working on p5506_supervillin_e1
[07:51:21] mdrun_gpu returned -1
[07:51:21] Going to send back what have done.
[07:51:22] logfile size: 9574 info=9574 bed=0 hdr=25
[07:51:22] - Writing 10112 bytes of core data to disk...
[07:51:22] Done: 9600 -> 3392 (compressed to 35.3 percent)
[07:51:22]   ... Done.
[07:51:22]
[07:51:22] Folding@home Core Shutdown: UNSTABLE_MACHINE
[07:51:24] CoreStatus = 7A (122)
[07:51:24] Sending work to server
[07:51:24] Project: 5506 (Run 7, Clone 159, Gen 23)
[07:51:24] - Read packet limit of 540015616... Set to 524286976.


[07:51:24] + Attempting to send results [August 15 07:51:24 UTC]
[07:51:25] + Results successfully sent
[07:51:25] Thank you for your contribution to Folding@Home.
[07:51:29] - Preparing to get new work unit...
[07:51:29] + Attempting to get work packet
[07:51:29] - Connecting to assignment server
[07:51:30] - Successful: assigned to (171.64.65.106).
[07:51:30] + News From Folding@Home: GPU folding beta
[07:51:30] Loaded queue successfully.
[07:51:31] + Closed connections
[07:51:36]
[07:51:36] + Processing work unit
[07:51:36] Core required: FahCore_11.exe
[07:51:36] Core found.
[07:51:36] Working on queue slot 09 [August 15 07:51:36 UTC]
[07:51:36] + Working ...
[07:51:36]
[07:51:36] *------------------------------*
[07:51:36] Folding@Home GPU Core - Beta
[07:51:36] Version 1.09 (Fri Aug 1 11:46:54 PDT 2008)
[07:51:36]
[07:51:36] Compiler  : Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 14.00.50727.762 for 80x86
[07:51:36] Build host: amoeba
[07:51:36] Board Type: Nvidia
[07:51:36] Core      :
[07:51:36] Preparing to commence simulation
[07:51:36] - Looking at optimizations...
[07:51:36] - Created dyn
[07:51:36] - Files status OK
[07:51:36] - Expanded 45499 -> 246249 (decompressed 541.2 percent)
[07:51:36] Called DecompressByteArray: compressed_data_size=45499 data_size=246249, decompressed_data_size=246249 diff=0
[07:51:36] - Digital signature verified
[07:51:36]
[07:51:36] Project: 5506 (Run 1, Clone 141, Gen 22)
[07:51:36]
[07:51:36] Assembly optimizations on if available.
[07:51:36] Entering M.D.
[07:51:43] Working on p5506_supervillin_e1
[07:51:43] mdrun_gpu returned -1
[07:51:43] Going to send back what have done.
[07:51:44] logfile size: 9573 info=9573 bed=0 hdr=25
[07:51:44] - Writing 10111 bytes of core data to disk...
[07:51:44] Done: 9599 -> 3394 (compressed to 35.3 percent)
[07:51:44]   ... Done.
[07:51:44]
[07:51:44] Folding@home Core Shutdown: UNSTABLE_MACHINE
[07:51:46] CoreStatus = 7A (122)
[07:51:46] Sending work to server
[07:51:46] Project: 5506 (Run 1, Clone 141, Gen 22)
[07:51:46] - Read packet limit of 540015616... Set to 524286976.


[07:51:46] + Attempting to send results [August 15 07:51:46 UTC]
[07:51:47] + Results successfully sent
[07:51:47] Thank you for your contribution to Folding@Home.
[07:51:51] - Preparing to get new work unit...
[07:51:51] + Attempting to get work packet
[07:51:51] - Connecting to assignment server
[07:51:52] - Successful: assigned to (171.64.65.106).
[07:51:52] + News From Folding@Home: GPU folding beta
[07:51:52] Loaded queue successfully.
[07:51:53] + Closed connections
[07:51:58]
[07:51:58] + Processing work unit
[07:51:58] Core required: FahCore_11.exe
[07:51:58] Core found.
[07:51:58] Working on queue slot 00 [August 15 07:51:58 UTC]
[07:51:58] + Working ...
[07:51:58]
[07:51:58] *------------------------------*
[07:51:58] Folding@Home GPU Core - Beta
[07:51:58] Version 1.09 (Fri Aug 1 11:46:54 PDT 2008)
[07:51:58]
[07:51:58] Compiler  : Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 14.00.50727.762 for 80x86
[07:51:58] Build host: amoeba
[07:51:58] Board Type: Nvidia
[07:51:58] Core      :
[07:51:58] Preparing to commence simulation
[07:51:58] - Looking at optimizations...
[07:51:58] - Created dyn
[07:51:58] - Files status OK
[07:51:58] - Expanded 45551 -> 246249 (decompressed 540.6 percent)
[07:51:58] Called DecompressByteArray: compressed_data_size=45551 data_size=246249, decompressed_data_size=246249 diff=0
[07:51:58] - Digital signature verified
[07:51:58]
[07:51:58] Project: 5506 (Run 6, Clone 184, Gen 22)
[07:51:58]
[07:51:58] Assembly optimizations on if available.
[07:51:58] Entering M.D.
[07:52:05] Working on p5506_supervillin_e1
[07:52:05] Gromacs cannot continue further.
[07:52:05] Going to send back what have done.
[07:52:06] logfile size: 9573 info=9573 bed=0 hdr=23
[07:52:06] - Writing 10109 bytes of core data to disk...
[07:52:06] Done: 9597 -> 3393 (compressed to 35.3 percent)
[07:52:06]   ... Done.
[07:52:06]
[07:52:06] Folding@home Core Shutdown: UNSTABLE_MACHINE
[07:52:09] CoreStatus = 7A (122)
[07:52:09] Sending work to server
[07:52:09] Project: 5506 (Run 6, Clone 184, Gen 22)
[07:52:09] - Read packet limit of 540015616... Set to 524286976.


[07:52:09] + Attempting to send results [August 15 07:52:09 UTC]
[07:52:09] + Results successfully sent
[07:52:09] Thank you for your contribution to Folding@Home.
[07:52:13] EUE limit exceeded. Pausing 24 hours.


I've seen it happen twice so far when left running overnight, both times on GPU 0. Even after a few hours (i.e. when I find it stopped) when the GPU has cooled down, it still will not resume when the instance is started up again. A reboot clears the issue.

Re: GPU2 in Linux via WINE

Postby Belboz99 » Thu Aug 21, 2008 12:35 pm

Hey! The 177.67 nVidia drivers for Linux are out!

And guess what? CPU usage for FahCore_11 is only 3% of my CPU (15% of one core) on a Q6600!

The biggest thing is that I can fully use the system, and it actually performs better than Windows does when running the GPU client! :)


http://www.nvidia.com/object/cuda_get.html


Dan

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Thu Aug 21, 2008 1:42 pm

Is this with the throttled wrapper or without? With the throttled wrapper I get about 15-17% CPU usage anyway, with the old driver.

[edit]
Just updated the driver (177.67) and the Cuda 2 libraries, and I cannot see any performance difference. CPU usage is still the same too, around 15-17% with the throttling Cuda wrapper dll.

Re: GPU2 in Linux via WINE

Postby legoman666 » Thu Aug 21, 2008 2:08 pm

shatteredsilicon wrote:I'm not sure if this is WINE or Cuda related, or whether the GX2 cards by design cannot handle being run constantly (without being OC-ed), but I'm seeing the following failure after a while on one of the GPU instances:

[UNSTABLE_MACHINE log snipped; see the full post above]

I've seen it happen twice so far when left running overnight, both times on GPU 0. Even after a few hours (i.e. when I find it stopped) when the GPU has cooled down, it still will not resume when the instance is started up again. A reboot clears the issue.



Maybe the nVidia Linux driver doesn't control the fan speeds as well as it should? Can you increase the fan speed manually and let it run overnight?

Re: GPU2 in Linux via WINE

Postby shatteredsilicon » Thu Aug 21, 2008 3:24 pm

legoman666 wrote:Maybe the nVidia Linux driver doesn't control the fan speeds as well as it should? Can you increase the fan speed manually and let it run overnight?


The problem seems to have gone away on its own. The machine runs 24/7.

I'm not convinced it's a fan issue; there was always quite a lot of air blowing out of the card. And shouldn't the fan speed be controlled by the BIOS on the card rather than by the driver (unless the BIOS is being overridden)?

Either way, the GX2 gets so hot that I have to wait 5 minutes after powering the machine off before I can actually remove it. The whole casing seems to act like a heatsink, and the sensors report about 83-86C peak while folding.
