ml/backend/ggml: use backend buffer type

This ensures tensors are created on the right buffer type for backends such as the CPU.
Michael Yang, 1 month ago
commit b27e8f3f10

1 changed file with 32 additions and 26 deletions:
ml/backend/ggml/ggml.go
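
In practice the change swaps the allocation call in newTensor from a backend to a buffer type. Below is a minimal sketch (not part of the commit) of the resulting pattern, assuming the cgo setup of the ml/backend/ggml package; the include lines stand in for that package's own cgo preamble and build flags:

    package ggml

    /*
    #include "ggml.h"
    #include "ggml-backend.h"
    */
    import "C"

    // allocTensor places an already-described tensor into a buffer of the given
    // buffer type. Before this commit the buffer came from a concrete backend via
    // ggml_backend_alloc_buffer; now it comes straight from the buffer type, which
    // may be a CPU buffer type or a GPU host buffer type.
    func allocTensor(buft *C.struct_ggml_backend_buffer_type, t *C.struct_ggml_tensor) {
    	b := C.ggml_backend_buft_alloc_buffer(buft, C.ggml_nbytes(t))
    	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
    }

The buffer type is looked up per device in New() (the deviceBufferTypes map in the diff below), so inputs, outputs, and each repeating layer carry the buffer type of the device they were assigned to.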

--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go

@@ -44,13 +44,13 @@ type Backend struct {
 	tensors map[string]*C.struct_ggml_tensor
 
 	// input is the backend used for inputs
-	input *C.struct_ggml_backend
+	input *C.struct_ggml_backend_buffer_type
 
 	// output is the backend used for outputs
-	output *C.struct_ggml_backend
+	output *C.struct_ggml_backend_buffer_type
 
 	// layers is the backend used for repeating layers
-	layers map[int]*C.struct_ggml_backend
+	layers map[int]*C.struct_ggml_backend_buffer_type
 
 	flashAttention bool
 
@@ -83,7 +83,10 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 	for _, d := range devices() {
 		switch C.ggml_backend_dev_type(d) {
 		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-			cpus = append(cpus, d)
+			if len(cpus) == 0 {
+				// only the first cpu device should be used
+				cpus = append(cpus, d)
+			}
 		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
 			accels = append(accels, d)
 		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
@@ -324,25 +327,25 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		return nil, err
 	}
 
-	// map devices to backends so tensors created post initialization can be assigned to the correct device
-	deviceBackends := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend)
+	// map devices to backend buffer types so new tensors can be assigned to the correct device
+	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
 
 	// create backends and buffer types used for the compute graph scheduler
 	var schedBackends []*C.struct_ggml_backend
 	var schedBufts []*C.struct_ggml_backend_buffer_type
 	for _, d := range append(gpus, append(accels, cpus...)...) {
 		b := C.ggml_backend_dev_init(d, nil)
-		schedBackends = append(schedBackends, b)
-		deviceBackends[d] = b
-
 		bt := C.ggml_backend_get_default_buffer_type(b)
-		// use the first gpu host buffer type for gpu if possible
 		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
-			if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil {
+			// use the first gpu host buffer type for gpu if possible
+			if hbt := C.ggml_backend_dev_host_buffer_type(gpus[0]); hbt != nil {
 				bt = hbt
 			}
 		}
 
+		deviceBufferTypes[d] = bt
+
+		schedBackends = append(schedBackends, b)
 		schedBufts = append(schedBufts, bt)
 
 		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
@@ -365,12 +368,12 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 			C.size_t(maxGraphNodes),
 			true,
 		),
-		input:  deviceBackends[input.d],
-		output: deviceBackends[output.d],
-		layers: func() map[int]*C.struct_ggml_backend {
-			m := make(map[int]*C.struct_ggml_backend)
+		input:  deviceBufferTypes[input.d],
+		output: deviceBufferTypes[output.d],
+		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
+			m := make(map[int]*C.struct_ggml_backend_buffer_type)
 			for i, layer := range layers {
-				m[i] = deviceBackends[layer.d]
+				m[i] = deviceBufferTypes[layer.d]
 			}
 			return m
 		}(),
@@ -401,13 +404,12 @@ func (b *Backend) NewContext() ml.Context {
 func (b *Backend) NewContextSize(n int) ml.Context {
 	n = min(n, b.maxGraphNodes)
 	return &Context{
-		b: b,
+		b:             b,
+		maxGraphNodes: n,
 		ctx: C.ggml_init(C.struct_ggml_init_params{
 			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
 			no_alloc: true,
 		}),
-		backend:       C.ggml_backend_sched_get_backend(b.sched, 0),
-		maxGraphNodes: n,
 	}
 }
 
@@ -425,8 +427,8 @@ type Context struct {
 	ctx   *C.struct_ggml_context
 	graph *C.struct_ggml_cgraph
 
-	// backend is the backend used for new tensors
-	backend *C.struct_ggml_backend
+	// buft is the buffer type used for new tensors
+	buft *C.struct_ggml_backend_buffer_type
 
 	// maxGraphNodes is the maximum allowed number of graph nodes in this context
 	maxGraphNodes int
@@ -437,7 +439,7 @@ func (c Context) Input() ml.Context {
 		return &Context{
 			b:             c.b,
 			ctx:           c.ctx,
-			backend:       c.b.input,
+			buft:          c.b.input,
 			maxGraphNodes: c.maxGraphNodes,
 		}
 	}
@@ -450,7 +452,7 @@ func (c Context) Output() ml.Context {
 		return &Context{
 			b:             c.b,
 			ctx:           c.ctx,
-			backend:       c.b.output,
+			buft:          c.b.output,
 			maxGraphNodes: c.maxGraphNodes,
 		}
 	}
@@ -459,11 +461,11 @@ func (c Context) Output() ml.Context {
 }
 
 func (c Context) Layer(i int) ml.Context {
-	if backend, ok := c.b.layers[i]; ok {
+	if buft, ok := c.b.layers[i]; ok {
 		return &Context{
 			b:             c.b,
 			ctx:           c.ctx,
-			backend:       backend,
+			buft:          buft,
 			maxGraphNodes: c.maxGraphNodes,
 		}
 	}
@@ -516,6 +518,10 @@ func shapeToGGML(shape []int) *C.int64_t {
 }
 
 func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
+	if c.buft == nil {
+		panic("set Input, Output, or Layer before creating tensors")
+	}
+
 	var cdtype uint32
 	switch dtype {
 	case ml.DTypeF32:
@@ -542,7 +548,7 @@ func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 	}
 
 	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
-	b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t))
+	b := C.ggml_backend_buft_alloc_buffer(c.buft, C.ggml_nbytes(t))
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
 	return &Tensor{b: c.b, t: t}
 }
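
Downstream, model code now selects where new tensors land by choosing a child context first; the bare context deliberately carries no buffer type, so newTensor panics if Input, Output, or Layer was never called. A hypothetical usage sketch follows, assuming the ml.Context interface exposes a variadic Zeros constructor (that name and signature are not part of this diff):

    package model

    import "github.com/ollama/ollama/ml"

    // buildExample shows the intended calling pattern: pick Input, Output, or
    // Layer before creating tensors so each tensor is allocated on the buffer
    // type of the device it belongs to.
    func buildExample(ctx ml.Context) {
    	// input tensors land on the input buffer type (CPU memory, or a GPU
    	// host buffer type when a GPU is present)
    	tokens := ctx.Input().Zeros(ml.DTypeF32, 512)

    	// tensors for a repeating layer land on whichever device that layer
    	// was assigned to
    	hidden := ctx.Layer(0).Zeros(ml.DTypeF32, 4096, 512)

    	// creating a tensor directly on ctx would panic: no buffer type selected
    	_, _ = tokens, hidden
    }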