ggml.go

package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
import "C"

import (
	"context"
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"runtime"
	"slices"
	"strconv"
	"strings"
	"sync/atomic"
	"unicode"
	"unsafe"

	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
	"golang.org/x/sync/errgroup"
)
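
// devices returns every backend device registered with ggml. OnceLoad ensures
// the ggml backend libraries are loaded exactly once before enumeration.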
func devices() []*C.struct_ggml_backend_device {
	ggml.OnceLoad()
	ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count())
	for i := range ds {
		ds[i] = C.ggml_backend_dev_get(C.size_t(i))
	}

	return ds
}
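
// Backend implements ml.Backend on top of ggml via cgo. It holds the decoded
// model metadata, the loaded weight tensors, the graph scheduler, and the
// buffer types used to place new tensors.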
type Backend struct {
	meta *fs.GGML

	sched   *C.struct_ggml_backend_sched
	tensors map[string]*C.struct_ggml_tensor

	// input is the buffer type used for input tensors
	input *C.struct_ggml_backend_buffer_type

	// output is the buffer type used for output tensors
	output *C.struct_ggml_backend_buffer_type

	// layers maps each repeating layer to the buffer type it is assigned to
	layers map[int]*C.struct_ggml_backend_buffer_type

	flashAttention bool

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int
}
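
// New decodes the GGUF model in r, assigns tensors to cpu and gpu buffer types
// according to params, loads the weight data in parallel, and builds the
// scheduler used to run compute graphs.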
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
	meta, n, err := fs.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

	type deviceBufferType struct {
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
	for _, d := range devices() {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
			gpus = append(gpus, d)
		}
	}

	// create list of buffer types for the cpu
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
		}
	}

	// create list of buffer types for each gpu
	var gpuDeviceBufferTypes []deviceBufferType
	for _, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
			d:   d,
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
		})
	}

	useDefaultSplit := true
	for _, s := range params.TensorSplit {
		if s != 0 {
			useDefaultSplit = false
			break
		}
	}

	// calculate splits
	splits := make([]float32, len(gpus))
	if useDefaultSplit {
		// default: split on free memory
		for i := range splits {
			var free, total C.size_t
			C.ggml_backend_dev_memory(gpus[i], &free, &total)
			splits[i] = float32(free)
		}
	} else {
		splits = params.TensorSplit
	}

	var sum float32
	// cumulative sum of all splits
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}

	// normalize splits
	for i := range splits {
		splits[i] /= sum
	}

	// inputs always use cpu
	input := cpuDeviceBufferType

	blocks := int(meta.KV().BlockCount())

	// define a range of gpu layers. anything outside of this range is assigned to the cpu
	gpuRangeStart := max(0, blocks-params.NumGPULayers)
	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
	assignLayer := func(i int) deviceBufferType {
		if i < gpuRangeStart || i >= gpuRangeStop {
			return cpuDeviceBufferType
		}

		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
		if index < 0 || index >= len(gpuDeviceBufferTypes) {
			return cpuDeviceBufferType
		}

		return gpuDeviceBufferTypes[index]
	}
	// repeating layers are assigned to a gpu by comparing their normalized
	// position within the gpu range, (i - gpuRangeStart) / (gpuRangeStop - gpuRangeStart),
	// against the cumulative splits
	layers := make([]deviceBufferType, blocks)
	for i := range layers {
		layers[i] = assignLayer(i)
	}

	// outputs are assigned iff allowed by splits and configured number of gpu layers
	output := assignLayer(blocks)
	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	// each layer has at most 2 extra tensors for rope operations
	maxTensors += blocks * 2

	type tensor struct {
		source *fs.Tensor
		target string
	}

	// some tensors are mapped to different names so keep a list
	targets := make(map[string][]string)

	// contexts are shared by tensors of the same buffer type
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}

			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
			defer C.free(unsafe.Pointer(cname))
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
			C.ggml_set_name(tt, cname)

			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
	}

	contains := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
	}

	for _, t := range meta.Tensors().Items() {
		switch {
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
			createTensor(tensor{source: t}, input.bts)
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
			}
		case contains(t.Name, "cls", "output", "output_norm"):
			createTensor(tensor{source: t}, output.bts)
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			// TODO: assign vision tensors to the gpu if possible
			createTensor(tensor{source: t}, output.bts)
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts)
			}
		default:
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
				}
			}

			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts)
			} else {
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts)
			}
		}
	}
	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs))
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = b
	}

	for bs := range maps.Values(bbs) {
		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
	}

	// map tensor names to tensors for easy lookup later
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}

	var doneBytes atomic.Uint64
	totalBytes := uint64(n) - meta.Tensors().Offset

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range meta.Tensors().Items() {
		g.Go(func() error {
			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
			for i := range tts {
				target := targets[t.Name][i]
				if target == "" {
					target = t.Name
				}

				tt, ok := tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}

				tts[i] = tt
			}

			sr := io.NewSectionReader(r, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
					return err
				}

				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
				}

				s += uint64(n)

				if params.Progress != nil {
					done := doneBytes.Add(uint64(n))
					params.Progress(float32(done) / float32(totalBytes))
				}
			}

			return nil
		})
	}

	// start a goroutine to cancel the errgroup if the parent context is done
	go func() {
		<-ctx.Done()
		g.Go(func() error {
			return ctx.Err()
		})
	}()

	if err := g.Wait(); err != nil {
		return nil, err
	}
	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []*C.struct_ggml_backend
	var schedBufts []*C.struct_ggml_backend_buffer_type
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		bt := C.ggml_backend_get_default_buffer_type(b)
		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
			// if the device is a cpu, prefer the first gpu's host buffer type when available
			if hbt := C.ggml_backend_dev_host_buffer_type(gpus[0]); hbt != nil {
				bt = hbt
			}
		}

		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
		schedBufts = append(schedBufts, bt)

		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

		if C.ggml_backend_is_cpu(b) {
			// set number of threads for cpu backend
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
		}
	}

	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
	return &Backend{
		flashAttention: params.FlashAttention,
		meta:           meta,
		tensors:        tensors,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(maxGraphNodes),
			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
		),
		input:  deviceBufferTypes[input.d],
		output: deviceBufferTypes[output.d],
		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
			m := make(map[int]*C.struct_ggml_backend_buffer_type)
			for i, layer := range layers {
				m[i] = deviceBufferTypes[layer.d]
			}
			return m
		}(),
		maxGraphNodes: maxGraphNodes,
	}, nil
}

func init() {
	ml.RegisterBackend("ggml", New)
}

func (b *Backend) Config() ml.Config {
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
	return b.NewContextSize(b.maxGraphNodes)
}

func (b *Backend) NewContextSize(n int) ml.Context {
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

	return &Context{
		b:             b,
		maxGraphNodes: n,
		ctx: C.ggml_init(C.struct_ggml_init_params{
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
			no_alloc: true,
		}),
	}
}
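
// CacheConfig reports the KV cache layout the backend expects: flash attention
// needs an F16 mask padded to GGML_KQ_MASK_PAD, while the default path uses a
// smaller cache padding and a permuted V tensor.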
func (b *Backend) CacheConfig() ml.CacheConfig {
	if b.flashAttention {
		return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
	} else {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}
}
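
// Context wraps a ggml context used to build a single compute graph. Input,
// Output, and Layer return views whose new tensors are placed on the
// corresponding buffer type.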
type Context struct {
	b *Backend

	ctx   *C.struct_ggml_context
	graph *C.struct_ggml_cgraph

	// buft is the buffer type used for new tensors
	buft *C.struct_ggml_backend_buffer_type

	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int
}

func (c Context) Input() ml.Context {
	if c.b.input != nil {
		return &Context{
			b:             c.b,
			ctx:           c.ctx,
			buft:          c.b.input,
			maxGraphNodes: c.maxGraphNodes,
		}
	}

	return &c
}

func (c Context) Output() ml.Context {
	if c.b.output != nil {
		return &Context{
			b:             c.b,
			ctx:           c.ctx,
			buft:          c.b.output,
			maxGraphNodes: c.maxGraphNodes,
		}
	}

	return &c
}

func (c Context) Layer(i int) ml.Context {
	if buft, ok := c.b.layers[i]; ok {
		return &Context{
			b:             c.b,
			ctx:           c.ctx,
			buft:          buft,
			maxGraphNodes: c.maxGraphNodes,
		}
	}

	return &c
}
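
// Forward creates the compute graph on first use and expands it with the given
// tensors and their dependencies.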
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
	if c.graph == nil {
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
	}

	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
}
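
// Compute submits the graph to the scheduler asynchronously. Output tensors
// share a sync function so the first read of their data (Bytes or Floats)
// blocks until the computation has finished.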
func (c Context) Compute(tensors ...ml.Tensor) {
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
	C.ggml_backend_sched_reset(c.b.sched)

	needSync := true
	sync := func() {
		if needSync {
			C.ggml_backend_sched_synchronize(c.b.sched)
			needSync = false
		}
	}

	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
		}
	}
}

func (c Context) MaxGraphNodes() int {
	return c.maxGraphNodes
}

func shapeToGGML(shape []int) *C.int64_t {
	sh := make([]C.int64_t, len(shape))
	for i, s := range shape {
		sh[i] = C.int64_t(s)
	}

	return &sh[0]
}
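
// pad rounds length up to the next multiple of pad.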
func pad(length, pad C.size_t) C.size_t {
	return ((length + pad - 1) / pad) * pad
}
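
// newTensor allocates a tensor of the given dtype and shape from a dedicated
// buffer of the context's currently selected buffer type.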
func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
	if c.buft == nil {
		panic("set Input, Output, or Layer before creating tensors")
	}

	var cdtype uint32
	switch dtype {
	case ml.DTypeF32:
		cdtype = C.GGML_TYPE_F32
	case ml.DTypeF16:
		cdtype = C.GGML_TYPE_F16
	case ml.DTypeQ80:
		cdtype = C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		cdtype = C.GGML_TYPE_Q4_0
	case ml.DTypeI32:
		cdtype = C.GGML_TYPE_I32
	default:
		panic("unsupported dtype")
	}

	if len(shape) < 1 || shape[0] == 0 {
		var shape C.int64_t = 0
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
	} else if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	return &Tensor{b: c.b, t: t}
}

func (c Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	return c.newTensor(dtype, shape)
}

func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
	t := c.newTensor(dtype, shape)
	C.ggml_set_zero(t.(*Tensor).t)
	return t
}
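
// checkShape verifies that the number of elements in s matches the product of
// the dimensions in shape.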
func checkShape[S ~[]E, E any](s S, shape ...int) error {
	n := len(s)
	if n == 0 {
		return nil
	}

	for _, v := range shape {
		n /= v
	}

	if n != 1 {
		return fmt.Errorf("invalid shape: %v", shape)
	}

	return nil
}

func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t := c.newTensor(ml.DTypeF32, shape)

	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t, nil
}

func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t := c.newTensor(ml.DTypeI32, shape)

	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t, nil
}

func (c *Context) Close() {
	if c != nil {
		C.ggml_free(c.ctx)
	}
}
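
// Tensor wraps a ggml tensor together with the backend that owns it. sync,
// when set by Compute, blocks until the scheduled graph has finished before
// the tensor's data is read back.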
type Tensor struct {
	b    *Backend
	t    *C.struct_ggml_tensor
	sync func()
}

func (t *Tensor) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
		slog.Any("shape", t.Shape()),
	)
}

func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
}

func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
}

func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) > 0 {
		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
	}

	return t
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
		b: t.b,
		t: mul,
	}
}

func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
	tt := (&Tensor{b: t.b, t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
	if b != nil {
		tt = tt.Add(ctx, b)
	}

	return tt
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
	return (&Tensor{b: t.b, t: C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
}

func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}
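
// rope mode values passed through to ggml's rope operations (norm, neox,
// mrope, vision).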
const (
	ropeTypeNorm   C.int = 0
	ropeTypeNeox   C.int = 2
	ropeTypeMrope  C.int = 8
	ropeTypeVision C.int = 24
)
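
// RoPE applies rotary position embeddings to t. Quantized tensors are cast to
// F32 before the rope op, and the YaRN parameters below are fixed constants.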
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
		ropeFactors = &Tensor{b: t.b}
	}

	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
			C.int(ropeDim),
			C.int(ropeType),
			131072, // YaRN n_ctx_train
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) RoPEMulti(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, sections [4]int, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
		ropeFactors = &Tensor{b: t.b}
	}

	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

	// convert sections to C ints explicitly rather than reinterpreting the Go
	// array, since Go's int and C's int may differ in width
	csections := [4]C.int{C.int(sections[0]), C.int(sections[1]), C.int(sections[2]), C.int(sections[3])}

	return &Tensor{
		b: t.b,
		t: C.ggml_rope_multi(
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
			C.int(ropeDim),
			&csections[0],
			C.int(ropeType),
			131072, // YaRN n_ctx_train
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) IM2Col(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, weight.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}

func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
	}
}

func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
	case 1:
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}
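
// ScaledDotProductAttention computes attention over the permuted query, key,
// and value tensors, using ggml's fused flash attention kernel when the
// backend was created with flash attention enabled and a soft-max based path
// otherwise.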
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		value = value.Permute(ctx, 0, 2, 1, 3)

		kqv := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: kqv}
	} else {
		kq := key.MulmatFullPrec(ctx, query)

		kq = &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, kq.(*Tensor).t, kqMask, C.float(scale), 0),
		}

		kqv := value.Mulmat(ctx, kq)
		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}