
benchmark: compare backend graph computation times

Track execution time of individual tensor operations (views, copies, reshapes, etc.)
during LLM forward passes using cgo bindings to the native graph runtime. This
helps identify performance bottlenecks in the computation graph and optimize memory
operations that can significantly impact inference latency.
Bruce MacDonald, 2 months ago
Commit 057cc54b66
5 changed files with 225 additions and 1 deletion
  1. benchmark/ggml_backend_benchmark_test.go (+86, -0)
  2. envconfig/config.go (+2, -0)
  3. kvcache/causal_test.go (+4, -0)
  4. ml/backend.go (+26, -1)
  5. ml/backend/ggml/ggml.go (+107, -0)
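The benchmark exercises a full model forward pass and reports one timing metric per operation type through the standard Go benchmark machinery. With a model available locally, a typical invocation would be `go test ./benchmark -bench BenchmarkGGMLOperations -m <model>`, where `<model>` is a placeholder for a locally pulled model name and `-m` is the flag registered in the test file below; the `OLLAMA_BENCHMARK` and `OLLAMA_BACKEND` variables are set by the benchmark itself via `b.Setenv`.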

+ 86 - 0
benchmark/ggml_backend_benchmark_test.go

@@ -0,0 +1,86 @@
+package backend
+
+import (
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/server"
+
+	_ "github.com/ollama/ollama/model/models/llama"
+)
+
+var modelName = flag.String("m", "", "Name of the model to benchmark")
+
+func suppressOutput() (cleanup func()) {
+	oldStdout, oldStderr := os.Stdout, os.Stderr
+	os.Stdout, os.Stderr = nil, nil // writes to a nil *os.File fail with ErrInvalid, silently discarding output
+	log.SetOutput(io.Discard)
+
+	return func() {
+		os.Stdout, os.Stderr = oldStdout, oldStderr
+		log.SetOutput(os.Stderr)
+	}
+}
+
+func setupModel(b *testing.B) model.Model {
+	if *modelName == "" {
+		b.Fatal("Error: -m flag is required for benchmark tests")
+	}
+
+	sm, err := server.GetModel(*modelName)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	m, err := model.New(sm.ModelPath)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
+	return m
+}
+
+func BenchmarkGGMLOperations(b *testing.B) {
+	// loading the GGML backend writes log output to stdout, which makes the benchmark output messy
+	cleanup := suppressOutput()
+	defer cleanup()
+
+	b.Setenv("OLLAMA_BENCHMARK", "1")
+	b.Setenv("OLLAMA_BACKEND", "ggml")
+
+	m := setupModel(b)
+
+	// Sample input data
+	inputIDs := []int32{1, 2, 3, 4, 5}
+	options := model.Options{
+		Inputs:    inputIDs,
+		Positions: []int32{1, 2, 3, 4, 5},
+		Sequences: []int{1, 1, 1, 1, 1},
+		Outputs:   []int32{int32(len(inputIDs) - 1)},
+	}
+
+	b.ResetTimer()
+
+	for range b.N {
+		ctx := m.Backend().NewContext()
+
+		modelOutput, err := model.Forward(ctx, m, options)
+		if err != nil {
+			b.Fatal(fmt.Errorf("forward pass failed: %v", err))
+		}
+
+		ctx.Compute(modelOutput)
+
+		for _, op := range ctx.Timing() {
+			b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
+		}
+		// close per iteration; defer would accumulate b.N open contexts
+		ctx.Close()
+	}
+}
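
A caveat when extending this benchmark: `testing.B.ReportMetric` overrides any previously reported value for the same unit, so calling it once per operation keeps only the last duration recorded for each operation type. A minimal sketch, not part of the commit, of a hypothetical `reportTimings` helper that sums per type first:

```go
// reportTimings is a hypothetical helper that totals durations per
// operation type before reporting; ReportMetric keeps only the last
// value written for any given unit, so summing first yields a total
// rather than the duration of the final op of each type.
func reportTimings(b *testing.B, timings []ml.OpTiming) {
	totals := make(map[ml.OpType]float64)
	for _, op := range timings {
		totals[op.Type] += op.Duration
	}
	for opType, ms := range totals {
		b.ReportMetric(ms, fmt.Sprintf("%s_ms", opType))
	}
}
```

Inside the loop, this would replace the per-op `ReportMetric` calls with `reportTimings(b, ctx.Timing())`.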

+ 2 - 0
envconfig/config.go

@@ -167,6 +167,8 @@ var (
 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
+	// Benchmark reports whether Ollama is running in a benchmark context, in which additional timing data is collected.
+	Benchmark = Bool("OLLAMA_BENCHMARK")
 )
 
 func String(s string) func() string {
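
`envconfig.Bool` returns an accessor function rather than a value, so the environment is consulted when the accessor is called. A sketch of the gating pattern this enables (the same pattern the `Compute` change below uses):

```go
// envconfig.Benchmark is a func() bool: the OLLAMA_BENCHMARK variable
// is read at call time rather than captured at package init
if envconfig.Benchmark() {
	// collect per-operation timing data (benchmark runs only)
}
```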

+ 4 - 0
kvcache/causal_test.go

@@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
 	return 10
 }
 
+func (c *testContext) Timing() []ml.OpTiming {
+	return []ml.OpTiming{}
+}
+
 func (c *testContext) Close() {}
 
 type testTensor struct {

+ 26 - 1
ml/backend.go

@@ -2,6 +2,7 @@ package ml
 
 import (
 	"bytes"
+	"cmp"
 	"encoding/binary"
 	"fmt"
 	"os"
@@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
 }
 
 func NewBackend(f *os.File) (Backend, error) {
-	if backend, ok := backends["ggml"]; ok {
+	if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
 		return backend(f)
 	}
 
@@ -53,6 +54,30 @@ type Context interface {
 	Compute(...Tensor)
 	MaxTensors() int
 	Close()
+
+	Timing() []OpTiming
+}
+
+// OpType is the type of operation performed during a forward pass.
+type OpType string
+
+const (
+	View       OpType = "View"
+	Copy       OpType = "Copy"
+	Reshape    OpType = "Reshape"
+	Permute    OpType = "Permute"
+	Contiguous OpType = "Contiguous"
+	Input      OpType = "Input"
+	ComputeOp  OpType = "Compute"
+	Transpose  OpType = "Transpose"
+)
+
+// OpTiming stores the timing information for a single operation.
+type OpTiming struct {
+	Type      OpType  // category of the operation (view, copy, compute, ...)
+	Operation string  // human-readable description of the operation
+	Duration  float64 // wall-clock duration in milliseconds
+	Order     int     // position of the operation within the forward pass
 }
 
 type Tensor interface {
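
The one-line change to `NewBackend` makes the backend selectable via `OLLAMA_BACKEND`: `cmp.Or` (standard library, Go 1.22+) returns the first of its arguments that is not the zero value, so an unset or empty variable falls through to `"ggml"`. A self-contained illustration:

```go
package main

import (
	"cmp"
	"fmt"
	"os"
)

func main() {
	// with OLLAMA_BACKEND unset, os.Getenv returns "" and cmp.Or
	// falls through to the "ggml" default; any non-empty value wins
	fmt.Println(cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml"))
}
```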

+ 107 - 0
ml/backend/ggml/ggml.go

@@ -4,6 +4,8 @@ package ggml
 #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
 #include <stdlib.h>
 #include <stdint.h>
+#include <time.h>
+#include <string.h>
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
@@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
 #endif
 }
 
+// Define a fixed-size struct to store timing data
+#define MAX_TENSOR_NAME 256
+#define MAX_TIMINGS 1000
+
+typedef struct {
+    char tensor_name[MAX_TENSOR_NAME];
+    double duration_ms;
+} timing_entry;
+
+typedef struct {
+    timing_entry entries[MAX_TIMINGS];
+    int count;
+} timing_data;
+
+// Global timing data structure
+timing_data g_timings = {0};
+
+double get_time_ms() {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
+}
+
+bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
+    static double start_time;
+    static char current_tensor[MAX_TENSOR_NAME];
+
+    if (ask) { // called before the node runs: record its name and a start timestamp
+        start_time = get_time_ms();
+        strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
+        current_tensor[MAX_TENSOR_NAME - 1] = '\0';
+    } else { // called after the node has been computed: record the elapsed time
+        double end_time = get_time_ms();
+        double duration = end_time - start_time;
+
+        if (g_timings.count < MAX_TIMINGS) {
+            strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
+            g_timings.entries[g_timings.count].duration_ms = duration;
+            g_timings.count++;
+        }
+    }
+    return true;
+}
+
+void clear_timings() {
+    g_timings.count = 0;
+}
+
 */
 import "C"
 
@@ -29,9 +79,11 @@ import (
 	"io"
 	"log/slog"
 	"os"
+	"strings"
 	"sync"
 	"unsafe"
 
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	fs "github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/ml"
@@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
 	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
 }
 
+// Timing retrieves the collected timing data
+func (c *Context) Timing() []ml.OpTiming {
+	sequence := make([]ml.OpTiming, C.g_timings.count)
+
+	for i := range int(C.g_timings.count) {
+		entry := C.g_timings.entries[i]
+		tensorName := C.GoString(&entry.tensor_name[0])
+
+		// Determine operation type and description based on tensor name
+		var opType ml.OpType
+		var opDesc string
+
+		switch {
+		case strings.Contains(tensorName, "(view)"):
+			opType, opDesc = ml.View, "Memory view"
+		case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
+			opType, opDesc = ml.Copy, "Memory copy"
+		case strings.Contains(tensorName, "(reshaped)"):
+			opType, opDesc = ml.Reshape, "Reshape"
+		case strings.Contains(tensorName, "(permuted)"):
+			opType, opDesc = ml.Permute, "Permute dimensions"
+		case strings.Contains(tensorName, "(cont)"):
+			opType, opDesc = ml.Contiguous, "Make contiguous"
+		case strings.Contains(tensorName, "(transposed)"):
+			opType, opDesc = ml.Transpose, "Transpose"
+		case strings.HasPrefix(tensorName, "leaf_"):
+			opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
+		case strings.HasPrefix(tensorName, "node_"):
+			opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
+		default:
+			opType, opDesc = "Unknown", tensorName
+		}
+
+		sequence[i] = ml.OpTiming{
+			Type:      opType,
+			Operation: opDesc,
+			Duration:  float64(entry.duration_ms),
+			Order:     i,
+		}
+	}
+
+	return sequence
+}
+
 func (c *Context) Compute(tensors ...ml.Tensor) {
+	if envconfig.Benchmark() {
+		// Clear previous timings before new computation
+		C.clear_timings()
+
+		C.ggml_backend_sched_set_eval_callback(
+			c.sched,
+			C.ggml_backend_eval_callback(C.debug_callback),
+			nil,
+		)
+	}
+
 	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
 
 	needSync := true
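
Taken together, the pieces compose as follows. This is a sketch of the intended call sequence using names from the diff; model setup is elided, and `m` and `opts` are assumed to exist:

```go
// assumes OLLAMA_BENCHMARK=1 in the environment and a loaded model m
ctx := m.Backend().NewContext()
defer ctx.Close()

out, err := model.Forward(ctx, m, opts) // builds the compute graph
if err != nil {
	panic(err)
}
ctx.Compute(out) // installs debug_callback and records into g_timings

// Timing classifies each recorded tensor by its name, preserving order
for _, op := range ctx.Timing() {
	fmt.Printf("#%03d %-10s %-28s %8.3f ms\n",
		op.Order, op.Type, op.Operation, op.Duration)
}
```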