//go:build linux || windows

package gpu

/*
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -lpthread
#include "gpu_info.h"
*/
import "C"

import (
	"fmt"
	"log"
	"sync"
	"unsafe"

	"github.com/jmorganca/ollama/api"
)
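
// handles holds the lazily-initialized GPU library handles; at most one of
// the two fields is non-nil once detection has run.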
type handles struct {
	cuda *C.cuda_handle_t
	rocm *C.rocm_handle_t
}

var gpuMutex sync.Mutex
var gpuHandles *handles
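
// initGPUHandles probes for an accelerator library, trying CUDA first and
// falling back to ROCm; whichever library loads successfully is kept for
// later memory queries.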
// Note: gpuMutex must already be held
func initGPUHandles() {
	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
	log.Printf("Detecting GPU type")
	gpuHandles = &handles{nil, nil}
	var resp C.cuda_init_resp_t
	C.cuda_init(&resp)
	if resp.err != nil {
		log.Printf("CUDA not detected: %s", C.GoString(resp.err))
		C.free(unsafe.Pointer(resp.err))
		var rocmResp C.rocm_init_resp_t
		C.rocm_init(&rocmResp)
		if rocmResp.err != nil {
			log.Printf("ROCm not detected: %s", C.GoString(rocmResp.err))
			C.free(unsafe.Pointer(rocmResp.err))
		} else {
			log.Printf("Radeon GPU detected")
			// Copy the handle into Go-managed memory before the response goes out of scope.
			rocm := rocmResp.rh
			gpuHandles.rocm = &rocm
		}
	} else {
		log.Printf("Nvidia GPU detected")
		cuda := resp.ch
		gpuHandles.cuda = &cuda
	}
}
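
// GetGPUInfo reports which accelerator was detected ("CUDA", "ROCM", or
// "CPU") along with the free and total memory, in bytes, of that device.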
func GetGPUInfo() GpuInfo {
	// TODO - consider exploring lspci (and equivalent on windows) to check for
	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
	gpuMutex.Lock()
	defer gpuMutex.Unlock()
	if gpuHandles == nil {
		initGPUHandles()
	}

	var memInfo C.mem_info_t
	resp := GpuInfo{"", 0, 0}
	if gpuHandles.cuda != nil {
		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
		if memInfo.err != nil {
			log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
			C.free(unsafe.Pointer(memInfo.err))
		} else {
			resp.Driver = "CUDA"
		}
	} else if gpuHandles.rocm != nil {
		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
		if memInfo.err != nil {
			log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
			C.free(unsafe.Pointer(memInfo.err))
		} else {
			resp.Driver = "ROCM"
		}
	}

	// Fall back to system RAM when no GPU memory lookup succeeded.
	if resp.Driver == "" {
		C.cpu_check_ram(&memInfo)
		resp.Driver = "CPU"
	}
	if memInfo.err != nil {
		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
		C.free(unsafe.Pointer(memInfo.err))
		return resp
	}

	resp.FreeMemory = uint64(memInfo.free)
	resp.TotalMemory = uint64(memInfo.total)
	return resp
}
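
// CheckVRAM returns the free VRAM in bytes, or an error when no GPU was
// detected.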
func CheckVRAM() (int64, error) {
	gpuInfo := GetGPUInfo()
	if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" {
		return int64(gpuInfo.FreeMemory), nil
	}
	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU-based memory determination
}
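
// NumGPU estimates how many model layers fit in the available VRAM. An
// explicit opts.NumGPU overrides the estimate, and 0 is returned when only
// the CPU is available.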
func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
	if opts.NumGPU != -1 {
		return opts.NumGPU
	}
	info := GetGPUInfo()
	if info.Driver == "CPU" {
		return 0
	}

	/*
		Calculate bytes per layer: roughly the size of the model file divided by
		the number of layers. Both the model weights and the KV cache live in
		VRAM, so the estimate below leaves headroom beyond the raw weight size.
	*/
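	// Worked example (hypothetical numbers): a 4 GiB model with 32 layers gives
	// bytesPerLayer = 128 MiB; with 6 GiB of free VRAM at most 48 layers fit,
	// and 75% of that is 36 layers.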
	bytesPerLayer := uint64(fileSizeBytes / numLayer)

	// Off-load at most 75% of the absolute maximum number of layers that fit in
	// available VRAM; off-loading too many layers to the GPU can cause OOM errors.
	layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
	log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Driver, numLayer)
	return layers
}