2 月之前 · 057cc54b66
--- a/benchmark/ggml_backend_benchmark_test.go
+++ b/benchmark/ggml_backend_benchmark_test.go
@@ -0,0 +1,86 @@
 
															+package backend
														
 
															+
														
 
															+import (
														
 
															+	"flag"
														
 
															+	"fmt"
														
 
															+	"io"
														
 
															+	"log"
														
 
															+	"os"
														
 
															+	"testing"
														
 
															+
														
 
															+	"github.com/ollama/ollama/ml"
														
 
															+	"github.com/ollama/ollama/model"
														
 
															+	"github.com/ollama/ollama/server"
														
 
															+
														
 
															+	_ "github.com/ollama/ollama/model/models/llama"
														
 
															+)
														
 
															+
														
 
															+var modelName = flag.String("m", "", "Name of the model to benchmark")
														
 
															+
														
 
															+func suppressOutput() (cleanup func()) {
														
 
															+	oldStdout, oldStderr := os.Stdout, os.Stderr
														
 
															+	os.Stdout, os.Stderr = nil, nil
														
 
															+	log.SetOutput(io.Discard)
														
 
															+
														
 
															+	return func() {
														
 
															+		os.Stdout, os.Stderr = oldStdout, oldStderr
														
 
															+		log.SetOutput(os.Stderr)
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+func setupModel(b *testing.B) model.Model {
														
 
															+	if *modelName == "" {
														
 
															+		b.Fatal("Error: -m flag is required for benchmark tests")
														
 
															+	}
														
 
															+
														
 
															+	sm, err := server.GetModel(*modelName)
														
 
															+	if err != nil {
														
 
															+		b.Fatal(err)
														
 
															+	}
														
 
															+
														
 
															+	m, err := model.New(sm.ModelPath)
														
 
															+	if err != nil {
														
 
															+		b.Fatal(err)
														
 
															+	}
														
 
															+
														
 
															+	m.Config().Cache.Init(m.Backend(), ml.DTypeF32, 2048)
														
 
															+	return m
														
 
															+}
														
 
															+
														
 
															+func BenchmarkGGMLOperations(b *testing.B) {
														
 
															+	// loading the GGML back-end logs to standard out and makes the bench output messy
														
 
															+	cleanup := suppressOutput()
														
 
															+	defer cleanup()
														
 
															+
														
 
															+	b.Setenv("OLLAMA_BENCHMARK", "1")
														
 
															+	b.Setenv("OLLAMA_BACKEND", "ggml")
														
 
															+
														
 
															+	m := setupModel(b)
														
 
															+
														
 
															+	// Sample input data
														
 
															+	inputIDs := []int32{1, 2, 3, 4, 5}
														
 
															+	options := model.Options{
														
 
															+		Inputs:    inputIDs,
														
 
															+		Positions: []int32{1, 2, 3, 4, 5},
														
 
															+		Sequences: []int{1, 1, 1, 1, 1},
														
 
															+		Outputs:   []int32{int32(len(inputIDs) - 1)},
														
 
															+	}
														
 
															+
														
 
															+	b.ResetTimer()
														
 
															+
														
 
															+	for range b.N {
														
 
															+		ctx := m.Backend().NewContext()
														
 
															+		defer ctx.Close()
														
 
															+
														
 
															+		modelOutput, err := model.Forward(ctx, m, options)
														
 
															+		if err != nil {
														
 
															+			b.Fatal(fmt.Errorf("forward pass failed: %v", err))
														
 
															+		}
														
 
															+
														
 
															+		ctx.Compute(modelOutput)
														
 
															+
														
 
															+		for _, op := range ctx.Timing() {
														
 
															+			b.ReportMetric(op.Duration, fmt.Sprintf("%s_ms", op.Type))
														
 
															+		}
														
 
															+	}
														
 
															+}
														
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -167,6 +167,8 @@ var (
 
															 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
														
 
															 	// Enable the new Ollama engine
														
 
															 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
														
 
															+	// Ollama is running in a benchmark context, additional timing data will be collected.
														
 
															+	Benchmark = Bool("OLLAMA_BENCHMARK")
														
 
															 )
														
 
															 func String(s string) func() string {
														
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -352,6 +352,10 @@ func (c *testContext) MaxTensors() int {
 
															 	return 10
														
 
															 }
														
 
															+func (c *testContext) Timing() []ml.OpTiming {
														
 
															+	return []ml.OpTiming{}
														
 
															+}
														
 
															+
														
 
															 func (c *testContext) Close() {}
														
 
															 type testTensor struct {
														
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -2,6 +2,7 @@ package ml
 
															 import (
														
 
															 	"bytes"
														
 
															+	"cmp"
														
 
															 	"encoding/binary"
														
 
															 	"fmt"
														
 
															 	"os"
														
@@ -37,7 +38,7 @@ func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
 
															 }
														
 
															 func NewBackend(f *os.File) (Backend, error) {
														
 
															-	if backend, ok := backends["ggml"]; ok {
														
 
															+	if backend, ok := backends[cmp.Or(os.Getenv("OLLAMA_BACKEND"), "ggml")]; ok {
														
 
															 		return backend(f)
														
 
															 	}
														
@@ -53,6 +54,30 @@ type Context interface {
 
															 	Compute(...Tensor)
														
 
															 	MaxTensors() int
														
 
															 	Close()
														
 
															+
														
 
															+	Timing() []OpTiming
														
 
															+}
														
 
															+
														
 
															+// OpType is the type of operation performed during a forward pass.
														
 
															+type OpType string
														
 
															+
														
 
															+const (
														
 
															+	View       OpType = "View"
														
 
															+	Copy       OpType = "Copy"
														
 
															+	Reshape    OpType = "Reshape"
														
 
															+	Permute    OpType = "Permute"
														
 
															+	Contiguous OpType = "Contiguous"
														
 
															+	Input      OpType = "Input"
														
 
															+	ComputeOp  OpType = "Compute"
														
 
															+	Transpose  OpType = "Transpose"
														
 
															+)
														
 
															+
														
 
															+// OpTiming stores the timing information for a single operation.
														
 
															+type OpTiming struct {
														
 
															+	Type      OpType
														
 
															+	Operation string
														
 
															+	Duration  float64
														
 
															+	Order     int
														
 
															 }
														
 
															 type Tensor interface {
														
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -4,6 +4,8 @@ package ggml
 
															 #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
														
 
															 #include <stdlib.h>
														
 
															 #include <stdint.h>
														
 
															+#include <time.h>
														
 
															+#include <string.h>
														
 
															 #include "ggml.h"
														
 
															 #include "ggml-cpu.h"
														
 
															 #include "ggml-backend.h"
														
@@ -21,6 +23,54 @@ COMPILER inline get_compiler() {
 
															 #endif
														
 
															 }
														
 
															+// Define a fixed-size struct to store timing data
														
 
															+#define MAX_TENSOR_NAME 256
														
 
															+#define MAX_TIMINGS 1000
														
 
															+
														
 
															+typedef struct {
														
 
															+    char tensor_name[MAX_TENSOR_NAME];
														
 
															+    double duration_ms;
														
 
															+} timing_entry;
														
 
															+
														
 
															+typedef struct {
														
 
															+    timing_entry entries[MAX_TIMINGS];
														
 
															+    int count;
														
 
															+} timing_data;
														
 
															+
														
 
															+// Global timing data structure
														
 
															+timing_data g_timings = {0};
														
 
															+
														
 
															+double get_time_ms() {
														
 
															+    struct timespec ts;
														
 
															+    clock_gettime(CLOCK_MONOTONIC, &ts);
														
 
															+    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
														
 
															+}
														
 
															+
														
 
															+bool debug_callback(struct ggml_tensor * t, bool ask, void * user_data) {
														
 
															+    static double start_time;
														
 
															+    static char current_tensor[MAX_TENSOR_NAME];
														
 
															+
														
 
															+    if (ask) {
														
 
															+        start_time = get_time_ms();
														
 
															+        strncpy(current_tensor, t->name, MAX_TENSOR_NAME - 1);
														
 
															+        current_tensor[MAX_TENSOR_NAME - 1] = '\0';
														
 
															+    } else {
														
 
															+        double end_time = get_time_ms();
														
 
															+        double duration = end_time - start_time;
														
 
															+
														
 
															+        if (g_timings.count < MAX_TIMINGS) {
														
 
															+            strncpy(g_timings.entries[g_timings.count].tensor_name, current_tensor, MAX_TENSOR_NAME - 1);
														
 
															+            g_timings.entries[g_timings.count].duration_ms = duration;
														
 
															+            g_timings.count++;
														
 
															+        }
														
 
															+    }
														
 
															+    return true;
														
 
															+}
														
 
															+
														
 
															+void clear_timings() {
														
 
															+    g_timings.count = 0;
														
 
															+}
														
 
															+
														
 
															 */
														
 
															 import "C"
														
@@ -29,9 +79,11 @@ import (
 
															 	"io"
														
 
															 	"log/slog"
														
 
															 	"os"
														
 
															+	"strings"
														
 
															 	"sync"
														
 
															 	"unsafe"
														
 
															+	"github.com/ollama/ollama/envconfig"
														
 
															 	"github.com/ollama/ollama/format"
														
 
															 	fs "github.com/ollama/ollama/fs/ggml"
														
 
															 	"github.com/ollama/ollama/ml"
														
@@ -256,7 +308,62 @@ func (c *Context) Forward(t ml.Tensor) {
 
															 	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
														
 
															 }
														
 
															+// Timing retrieves the collected timing data
														
 
															+func (c *Context) Timing() []ml.OpTiming {
														
 
															+	sequence := make([]ml.OpTiming, C.g_timings.count)
														
 
															+
														
 
															+	for i := range int(C.g_timings.count) {
														
 
															+		entry := C.g_timings.entries[i]
														
 
															+		tensorName := C.GoString(&entry.tensor_name[0])
														
 
															+
														
 
															+		// Determine operation type and description based on tensor name
														
 
															+		var opType ml.OpType
														
 
															+		var opDesc string
														
 
															+
														
 
															+		switch {
														
 
															+		case strings.Contains(tensorName, "(view)"):
														
 
															+			opType, opDesc = ml.View, "Memory view"
														
 
															+		case strings.Contains(tensorName, "(copy)") || strings.Contains(tensorName, "(copy of"):
														
 
															+			opType, opDesc = ml.Copy, "Memory copy"
														
 
															+		case strings.Contains(tensorName, "(reshaped)"):
														
 
															+			opType, opDesc = ml.Reshape, "Reshape"
														
 
															+		case strings.Contains(tensorName, "(permuted)"):
														
 
															+			opType, opDesc = ml.Permute, "Permute dimensions"
														
 
															+		case strings.Contains(tensorName, "(cont)"):
														
 
															+			opType, opDesc = ml.Contiguous, "Make contiguous"
														
 
															+		case strings.Contains(tensorName, "(transposed)"):
														
 
															+			opType, opDesc = ml.Transpose, "Transpose"
														
 
															+		case strings.HasPrefix(tensorName, "leaf_"):
														
 
															+			opType, opDesc = ml.Input, fmt.Sprintf("Input tensor %s", tensorName)
														
 
															+		case strings.HasPrefix(tensorName, "node_"):
														
 
															+			opType, opDesc = ml.ComputeOp, fmt.Sprintf("Computation %s", tensorName)
														
 
															+		default:
														
 
															+			opType, opDesc = "Unknown", tensorName
														
 
															+		}
														
 
															+
														
 
															+		sequence[i] = ml.OpTiming{
														
 
															+			Type:      opType,
														
 
															+			Operation: opDesc,
														
 
															+			Duration:  float64(entry.duration_ms),
														
 
															+			Order:     i,
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															+	return sequence
														
 
															+}
														
 
															+
														
 
															 func (c *Context) Compute(tensors ...ml.Tensor) {
														
 
															+	if envconfig.Benchmark() {
														
 
															+		// Clear previous timings before new computation
														
 
															+		C.clear_timings()
														
 
															+
														
 
															+		C.ggml_backend_sched_set_eval_callback(
														
 
															+			c.sched,
														
 
															+			C.ggml_backend_eval_callback(C.debug_callback),
														
 
															+			nil,
														
 
															+		)
														
 
															+	}
														
 
															+
														
 
															 	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
														
 
															 	needSync := true