Browse Source

llm: suppress large allocations for GGUF arrays

This introduces a little array type for holding GGUF arrays that
prevents the array from growing too large. It preserves the total size
of the array, but limits the number of elements that are actually
allocated.

Extremely large GGUF arrays, such as the tokenizer's token list, are
generally uninteresting to users and are not worth the memory overhead
or the time spent allocating and freeing them. They are necessary for
inference, but not for inspection.

The size of these arrays is, however, important in Ollama, so it is
preserved in a separate field on array.
Blake Mizerany 10 months ago
parent
commit
acbffa59e9
2 changed files with 31 additions and 7 deletions
  1. 1 1
      llm/ggml.go
  2. 30 6
      llm/gguf.go

+ 1 - 1
llm/ggml.go

@@ -321,7 +321,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
+	vocab := llm.KV()["tokenizer.ggml.tokens"].(*array).size
 
 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()

+ 30 - 6
llm/gguf.go

@@ -316,7 +316,7 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
 	return err
 }
 
-func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
+func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
 	t, err := readGGUF[uint32](llm, r)
 	if err != nil {
 		return nil, err
@@ -327,6 +327,8 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
 		return nil, err
 	}
 
+	a := &array{size: uint64(n)}
+
 	for i := 0; uint32(i) < n; i++ {
 		var e any
 		switch t {
@@ -361,13 +363,27 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
 			return nil, err
 		}
 
-		a = append(a, e)
+		if len(a.values) < arrayMaxSize {
+			a.values = append(a.values, e)
+		}
 	}
 
-	return
+	return a, nil
+}
+
+const arrayMaxSize = 1000
+
+type array struct {
+	size uint64
+
+	// values is the slice of values in the array.
+	//
+	// Its length may be less than size if the array is too big to reasonably
+	// fit in memory. The current limit is arrayMaxSize.
+	values []any
 }
 
-func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
+func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
 	if llm.Version == 1 {
 		return readGGUFV1Array(llm, r)
 	}
@@ -382,6 +398,8 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 		return nil, err
 	}
 
+	a := &array{size: n}
+
 	for i := 0; uint64(i) < n; i++ {
 		var e any
 		switch t {
@@ -416,10 +434,16 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 			return nil, err
 		}
 
-		a = append(a, e)
+		// TODO(bmizerany): We may want to only enforce this limit
+		// on certain fields, however, as of now, I (bmizerany) do
+		// not know of any array fields that are needed by Ollama that
+		// exceed this limit.
+		if len(a.values) < arrayMaxSize {
+			a.values = append(a.values, e)
+		}
 	}
 
-	return
+	return a, nil
 }
 
 func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {