ggml.go

package ggml

// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-cpu.h"
// #include "ggml-backend.h"
import "C"

import (
	"context"
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"runtime"
	"slices"
	"strconv"
	"strings"
	"sync/atomic"
	"unicode"
	"unsafe"

	"github.com/ollama/ollama/format"
	fs "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
	"golang.org/x/sync/errgroup"
)
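
// devices enumerates every ggml backend device exposed by the loaded ggml
// libraries (CPU, accelerators, and GPUs), in backend order.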
func devices() []*C.struct_ggml_backend_device {
	ggml.OnceLoad()
	ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count())
	for i := range ds {
		ds[i] = C.ggml_backend_dev_get(C.size_t(i))
	}

	return ds
}
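
// Backend is the ggml implementation of ml.Backend: it holds the decoded
// GGUF metadata, the loaded weight tensors, the graph scheduler, and the
// buffer types used to place new tensors on the right device.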
type Backend struct {
	meta    *fs.GGML
	sched   *C.struct_ggml_backend_sched
	tensors map[string]*C.struct_ggml_tensor

	// input is the buffer type used for input tensors
	input *C.struct_ggml_backend_buffer_type

	// output is the buffer type used for output tensors
	output *C.struct_ggml_backend_buffer_type

	// layers maps each repeating layer to the buffer type used for its tensors
	layers map[int]*C.struct_ggml_backend_buffer_type

	flashAttention bool

	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
	maxGraphNodes int
}
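
// New decodes GGUF metadata from r, assigns every tensor to a device buffer
// type according to params (gpu layer count and tensor split), loads the
// weight data, and builds the scheduler used to run compute graphs.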
func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
	meta, n, err := fs.Decode(r, -1)
	if err != nil {
		return nil, err
	}

	slog.Info(
		"",
		"architecture", meta.KV().Architecture(),
		"file_type", meta.KV().FileType(),
		"name", meta.KV().String("general.name"),
		"description", meta.KV().String("general.description"),
		"num_tensors", len(meta.Tensors().Items()),
		"num_key_values", len(meta.KV()),
	)

	type deviceBufferType struct {
		d   *C.struct_ggml_backend_device
		bts []*C.struct_ggml_backend_buffer_type
	}

	var cpus, accels, gpus []*C.struct_ggml_backend_device
	for _, d := range devices() {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU:
			if len(cpus) == 0 {
				// only the first cpu device should be used
				cpus = append(cpus, d)
			}
		case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			accels = append(accels, d)
		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
			gpus = append(gpus, d)
		}
	}

	// create list of buffer types for the cpu
	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
	for _, d := range append(accels, append(gpus, cpus...)...) {
		switch C.ggml_backend_dev_type(d) {
		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
		}
	}

	// create list of buffer types for each gpu
	var gpuDeviceBufferTypes []deviceBufferType
	for _, d := range gpus {
		bt := C.ggml_backend_dev_buffer_type(d)
		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
			d:   d,
			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
		})
	}

	useDefaultSplit := true
	for _, s := range params.TensorSplit {
		if s != 0 {
			useDefaultSplit = false
			break
		}
	}

	// calculate splits
	splits := make([]float32, len(gpus))
	if useDefaultSplit {
		// default: split on free memory
		for i := range splits {
			var free, total C.size_t
			C.ggml_backend_dev_memory(gpus[i], &free, &total)
			splits[i] = float32(free)
		}
	} else {
		splits = params.TensorSplit
	}

	var sum float32
	// cumulative sum of all splits
	for i := range splits {
		sum += splits[i]
		splits[i] = sum
	}

	// normalize splits
	for i := range splits {
		splits[i] /= sum
	}
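
	// At this point splits is a normalized cumulative distribution. For
	// example, two gpus with 12 GiB and 4 GiB free start as [12, 4], become
	// the cumulative [12, 16], and normalize to [0.75, 1.0]: the first gpu
	// takes layers whose normalized position falls below 0.75, the second
	// takes the rest.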
	// inputs always use cpu
	input := cpuDeviceBufferType

	blocks := int(meta.KV().BlockCount())

	// define a range of gpu layers. anything outside of this range is assigned to the cpu
	gpuRangeStart := max(0, blocks-params.NumGPULayers)
	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
	assignLayer := func(i int) deviceBufferType {
		if i < gpuRangeStart || i >= gpuRangeStop {
			return cpuDeviceBufferType
		}

		index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f })
		if index < 0 || index >= len(gpuDeviceBufferTypes) {
			return cpuDeviceBufferType
		}

		return gpuDeviceBufferTypes[index]
	}
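
	// For example, with blocks = 32 and params.NumGPULayers = 16, the gpu
	// range is [16, 32); layer 20 maps to fraction (20-16)/16 = 0.25, which
	// with splits [0.75, 1.0] selects the first gpu.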
	// repeating layers are assigned to devices based on their index
	layers := make([]deviceBufferType, blocks)
	for i := range layers {
		layers[i] = assignLayer(i)
	}

	// the output layer is assigned only if allowed by the splits and the configured number of gpu layers
	output := assignLayer(blocks)

	maxTensors := len(meta.Tensors().Items())
	maxTensors += 1
	// each layer has at most 2 extra tensors for rope operations
	maxTensors += blocks * 2

	type tensor struct {
		source *fs.Tensor
		target string
	}

	// some tensors are mapped to different names so keep a list
	targets := make(map[string][]string)

	// contexts are shared by tensors of the same buffer type
	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
		for _, bt := range bts {
			if _, ok := ctxs[bt]; !ok {
				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
					mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
					no_alloc: true,
				})
			}

			targets[t.source.Name] = append(targets[t.source.Name], t.target)

			name := t.source.Name
			if t.target != "" {
				name = t.target
			}

			cname := C.CString(name)
			defer C.free(unsafe.Pointer(cname))
			if tt := C.ggml_get_tensor(ctxs[bt], cname); tt != nil {
				return tt
			}

			tt := C.ggml_new_tensor(ctxs[bt], t.source.Kind, C.int(len(t.source.Shape)), (*C.int64_t)(unsafe.Pointer(&t.source.Shape[0])))
			C.ggml_set_name(tt, cname)

			slog.Debug("created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
			//nolint:staticcheck // TODO: check if buffer type supports this tensor
			return tt
		}

		return nil
	}

	contains := func(s string, parts ...string) bool {
		split := strings.Split(s, ".")
		for _, part := range parts {
			if slices.Contains(split, part) {
				return true
			}
		}

		return false
	}
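
	// Create a ggml tensor for every tensor in the file on the device that
	// will own it: token/position embeddings on the input device, output and
	// classifier tensors on the output device, vision tensors on the output
	// device, rope frequency factors replicated per layer, and everything
	// else on the device of the block named in the tensor (falling back to
	// the cpu when no block index can be parsed from the name).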
	for _, t := range meta.Tensors().Items() {
		switch {
		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
			createTensor(tensor{source: t}, input.bts)
			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
			}
		case contains(t.Name, "cls", "output", "output_norm"):
			createTensor(tensor{source: t}, output.bts)
		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
			// TODO: assign vision tensors to the gpu if possible
			createTensor(tensor{source: t}, output.bts)
		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
			// these tensors should be repeated per layer
			for i, layer := range layers {
				createTensor(tensor{
					source: t,
					target: "blk." + strconv.Itoa(i) + "." + t.Name,
				}, layer.bts)
			}
		default:
			layerIndex := -1
			if fields := strings.FieldsFunc(t.Name, func(r rune) bool { return !unicode.IsNumber(r) }); len(fields) > 0 {
				if i, err := strconv.Atoi(fields[0]); err == nil {
					layerIndex = i
				}
			}

			if layerIndex >= 0 {
				createTensor(tensor{source: t}, layers[layerIndex].bts)
			} else {
				// load all other tensors on the cpu
				createTensor(tensor{source: t}, input.bts)
			}
		}
	}

	// allocate buffers for each context
	bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs))
	for bt, c := range ctxs {
		if C.ggml_get_first_tensor(c) == nil {
			continue
		}

		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
		bbs[c] = b
	}

	for bs := range maps.Values(bbs) {
		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
	}

	// map tensor names to tensors for easy lookup later
	tensors := make(map[string]*C.struct_ggml_tensor)
	for _, c := range ctxs {
		for t := C.ggml_get_first_tensor(c); t != nil; t = C.ggml_get_next_tensor(c, t) {
			tensors[C.GoString(C.ggml_get_name(t))] = t
		}
	}
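
	// Stream the tensor data from the file into the allocated backend
	// buffers, one goroutine per tensor (bounded by GOMAXPROCS), copying in
	// 128 KiB chunks and reporting overall progress as bytes are written.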
	var doneBytes atomic.Uint64
	totalBytes := uint64(n) - meta.Tensors().Offset

	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(runtime.GOMAXPROCS(0))
	for _, t := range meta.Tensors().Items() {
		g.Go(func() error {
			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
			for i := range tts {
				target := targets[t.Name][i]
				if target == "" {
					target = t.Name
				}

				tt, ok := tensors[target]
				if !ok {
					return fmt.Errorf("unassigned tensor: %s", t.Name)
				}

				tts[i] = tt
			}

			sr := io.NewSectionReader(r, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
			bts := make([]byte, 128*format.KibiByte)

			var s uint64
			for s < t.Size() {
				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
				if err != nil {
					return err
				}

				for _, tt := range tts {
					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
				}

				s += uint64(n)

				if params.Progress != nil {
					done := doneBytes.Add(uint64(n))
					params.Progress(float32(done) / float32(totalBytes))
				}
			}

			return nil
		})
	}

	// start a goroutine to cancel the errgroup if the parent context is done
	go func() {
		<-ctx.Done()
		g.Go(func() error {
			return ctx.Err()
		})
	}()

	if err := g.Wait(); err != nil {
		return nil, err
	}

	// map devices to backend buffer types so new tensors can be assigned to the correct device
	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)

	// create backends and buffer types used for the compute graph scheduler
	var schedBackends []*C.struct_ggml_backend
	var schedBufts []*C.struct_ggml_backend_buffer_type
	for _, d := range append(gpus, append(accels, cpus...)...) {
		b := C.ggml_backend_dev_init(d, nil)
		bt := C.ggml_backend_get_default_buffer_type(b)
		if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
			// use the first gpu's host buffer type for the cpu backend if possible
			if hbt := C.ggml_backend_dev_host_buffer_type(gpus[0]); hbt != nil {
				bt = hbt
			}
		}

		deviceBufferTypes[d] = bt

		schedBackends = append(schedBackends, b)
		schedBufts = append(schedBufts, bt)

		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))

		if C.ggml_backend_is_cpu(b) {
			// set number of threads for cpu backend
			C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(params.NumThreads)))
		}
	}
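
	// size the scheduler's graph generously: at least 8192 nodes, or five
	// nodes per weight tensor for very large models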
	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
	return &Backend{
		flashAttention: params.FlashAttention,
		meta:           meta,
		tensors:        tensors,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(maxGraphNodes),
			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
		),
		input:  deviceBufferTypes[input.d],
		output: deviceBufferTypes[output.d],
		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
			m := make(map[int]*C.struct_ggml_backend_buffer_type)
			for i, layer := range layers {
				m[i] = deviceBufferTypes[layer.d]
			}
			return m
		}(),
		maxGraphNodes: maxGraphNodes,
	}, nil
}

func init() {
	ml.RegisterBackend("ggml", New)
}

func (b *Backend) Config() ml.Config {
	return b.meta.KV()
}

func (b *Backend) Get(name string) ml.Tensor {
	if t, ok := b.tensors[name]; ok {
		return &Tensor{b: b, t: t}
	}

	return nil
}

func (b *Backend) NewContext() ml.Context {
	return b.NewContextSize(b.maxGraphNodes)
}

func (b *Backend) NewContextSize(n int) ml.Context {
	if n > b.maxGraphNodes {
		panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
	}

	return &Context{
		b:             b,
		maxGraphNodes: n,
		ctx: C.ggml_init(C.struct_ggml_init_params{
			mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
			no_alloc: true,
		}),
	}
}

func (b *Backend) CacheConfig() ml.CacheConfig {
	if b.flashAttention {
		return ml.CacheConfig{CachePadding: 256, MaskDType: ml.DTypeF16, MaskBatchPadding: C.GGML_KQ_MASK_PAD}
	} else {
		return ml.CacheConfig{CachePadding: 32, PermutedV: true}
	}
}
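
// Context owns a ggml context, the compute graph under construction, and
// the buffer type (buft) that new tensors created through it will use.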
type Context struct {
	b *Backend

	ctx   *C.struct_ggml_context
	graph *C.struct_ggml_cgraph

	// buft is the buffer type used for new tensors
	buft *C.struct_ggml_backend_buffer_type

	// maxGraphNodes is the maximum allowed number of graph nodes in this context
	maxGraphNodes int
}

func (c Context) Input() ml.Context {
	if c.b.input != nil {
		return &Context{
			b:             c.b,
			ctx:           c.ctx,
			buft:          c.b.input,
			maxGraphNodes: c.maxGraphNodes,
		}
	}

	return &c
}

func (c Context) Output() ml.Context {
	if c.b.output != nil {
		return &Context{
			b:             c.b,
			ctx:           c.ctx,
			buft:          c.b.output,
			maxGraphNodes: c.maxGraphNodes,
		}
	}

	return &c
}

func (c Context) Layer(i int) ml.Context {
	if buft, ok := c.b.layers[i]; ok {
		return &Context{
			b:             c.b,
			ctx:           c.ctx,
			buft:          buft,
			maxGraphNodes: c.maxGraphNodes,
		}
	}

	return &c
}
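
// Forward lazily allocates the compute graph on first use and expands it
// with the given tensors as outputs.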
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
	if c.graph == nil {
		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
	}

	for _, tensor := range tensors {
		C.ggml_build_forward_expand(c.graph, tensor.(*Tensor).t)
	}

	return c
}
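
// Compute dispatches the built graph to the scheduler asynchronously. Each
// requested output tensor is given a sync callback so device results are
// only transferred back when the data is actually read (Bytes or Floats).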
func (c Context) Compute(tensors ...ml.Tensor) {
	C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
	C.ggml_backend_sched_reset(c.b.sched)

	needSync := true
	sync := func() {
		if needSync {
			C.ggml_backend_sched_synchronize(c.b.sched)
			needSync = false
		}
	}

	for _, t := range tensors {
		if C.ggml_nbytes(t.(*Tensor).t) > 0 {
			t.(*Tensor).sync = sync
		}
	}
}

func (c Context) MaxGraphNodes() int {
	return c.maxGraphNodes
}

func shapeToGGML(shape []int) *C.int64_t {
	sh := make([]C.int64_t, len(shape))
	for i, s := range shape {
		sh[i] = C.int64_t(s)
	}

	return &sh[0]
}
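
// pad rounds length up to the next multiple of pad, e.g. pad(100, 32) == 128.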
func pad(length, pad C.size_t) C.size_t {
	return ((length + pad - 1) / pad) * pad
}
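
// newTensor allocates a tensor of the given dtype and shape in a dedicated
// buffer of the context's current buffer type; a missing shape, or one whose
// first dimension is zero, produces an empty 1D tensor.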
func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
	if c.buft == nil {
		panic("set Input, Output, or Layer before creating tensors")
	}

	var cdtype uint32
	switch dtype {
	case ml.DTypeF32:
		cdtype = C.GGML_TYPE_F32
	case ml.DTypeF16:
		cdtype = C.GGML_TYPE_F16
	case ml.DTypeQ80:
		cdtype = C.GGML_TYPE_Q8_0
	case ml.DTypeQ40:
		cdtype = C.GGML_TYPE_Q4_0
	case ml.DTypeI32:
		cdtype = C.GGML_TYPE_I32
	default:
		panic("unsupported dtype")
	}

	if len(shape) < 1 || shape[0] == 0 {
		var shape C.int64_t = 0
		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
	} else if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	for _, dim := range shape {
		if dim < 1 {
			panic("invalid shape")
		}
	}

	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
	return &Tensor{b: c.b, t: t}
}

func (c Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
	return c.newTensor(dtype, shape)
}

func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
	t := c.newTensor(dtype, shape)
	C.ggml_set_zero(t.(*Tensor).t)
	return t
}
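
// checkShape sanity-checks that the number of elements in s is consistent
// with the requested shape; an empty slice is always accepted.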
func checkShape[S ~[]E, E any](s S, shape ...int) error {
	n := len(s)

	if n == 0 {
		return nil
	}

	for _, v := range shape {
		n /= v
	}

	if n != 1 {
		return fmt.Errorf("invalid shape: %v", shape)
	}

	return nil
}

func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t := c.newTensor(ml.DTypeF32, shape)

	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t, nil
}

func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
	if err := checkShape(s, shape...); err != nil {
		return nil, err
	}

	t := c.newTensor(ml.DTypeI32, shape)

	if len(s) > 0 {
		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
	}

	return t, nil
}

func (c *Context) Close() {
	if c != nil {
		C.ggml_free(c.ctx)
	}
}
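
// Tensor wraps a ggml tensor together with its owning backend; sync, when
// set by Compute, waits for pending scheduler work before data is read back.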
type Tensor struct {
	b    *Backend
	t    *C.struct_ggml_tensor
	sync func()
}

func (t *Tensor) LogValue() slog.Value {
	return slog.GroupValue(
		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
		slog.Any("shape", t.Shape()),
	)
}

func (t *Tensor) Dim(n int) int {
	return int(t.t.ne[n])
}

func (t *Tensor) Stride(n int) int {
	return int(t.t.nb[n])
}

func (t *Tensor) Shape() []int {
	shape := make([]int, C.ggml_n_dims(t.t))
	for i := range shape {
		shape[i] = t.Dim(i)
	}

	return shape
}

func (t *Tensor) Bytes() (data []byte) {
	if t.sync != nil {
		data = make([]byte, C.ggml_nbytes(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) Floats() (data []float32) {
	if t.sync != nil {
		data = make([]float32, C.ggml_nelements(t.t))

		t.sync()
		C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
	}

	return
}

func (t *Tensor) DType() ml.DType {
	switch t.t._type {
	case C.GGML_TYPE_F32:
		return ml.DTypeF32
	case C.GGML_TYPE_F16:
		return ml.DTypeF16
	case C.GGML_TYPE_Q8_0:
		return ml.DTypeQ80
	case C.GGML_TYPE_Q4_0:
		return ml.DTypeQ40
	case C.GGML_TYPE_I32:
		return ml.DTypeI32
	default:
		return ml.DTypeOther
	}
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
	if len(s) > 0 {
		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
	}

	return t
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
	}
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	mul := C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t)
	C.ggml_mul_mat_set_prec(mul, C.GGML_PREC_F32)

	return &Tensor{
		b: t.b,
		t: mul,
	}
}

func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
	tt := (&Tensor{b: t.b, t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
	if b != nil {
		tt = tt.Add(ctx, b)
	}

	return tt
}

func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
	return (&Tensor{b: t.b, t: C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
}

func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
	}
}

func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
		}
	case 2:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
		}
	case 4:
		return &Tensor{
			b: t.b,
			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
	}
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Unpad(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) != 4 {
		panic("expected 4 dimensions")
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
	}
}
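
// View creates a view of t. shape is an interleaved list of sizes and byte
// strides: n0, or n0, nb1, n1, or n0, nb1, n1, nb2, n2, or
// n0, nb1, n1, nb2, n2, nb3, n3, mapping to ggml_view_{1,2,3,4}d.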
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
	switch len(shape) {
	case 1:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
		}
	case 3:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]),
				C.size_t(shape[1]),
				C.size_t(offset)),
		}
	case 5:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
				C.size_t(shape[1]), C.size_t(shape[3]),
				C.size_t(offset)),
		}
	case 7:
		return &Tensor{
			b: t.b,
			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
				C.size_t(offset)),
		}
	default:
		panic("unsupported number of dimensions")
	}
}

const (
	ropeTypeNorm   C.int = 0
	ropeTypeNeox   C.int = 2
	ropeTypeMrope  C.int = 8
	ropeTypeVision C.int = 24
)
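
// RoPE applies rotary position embeddings to t. Quantized tensors are cast
// to F32 before rotation, and the YaRN parameters are fixed to the literal
// values passed below.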
func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
	if ropeFactors == nil {
		ropeFactors = &Tensor{b: t.b}
	}

	dequant := t.t
	if C.ggml_is_quantized(t.t._type) {
		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
	}

	return &Tensor{
		b: t.b,
		t: C.ggml_rope_ext(
			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
			C.int(ropeDim),
			C.int(ropeType),
			131072, // YaRN n_ctx_train
			C.float(ropeBase),
			C.float(ropeScale),
			0.,  // YaRN ext_factor
			1.,  // YaRN attn_factor
			32., // YaRN beta_fast
			1.,  // YaRN beta_slow
		),
	}
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
	}
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
	}
}

func (t *Tensor) AvgPool2D(ctx ml.Context, k, s int, p float32) ml.Tensor {
	return &Tensor{
		b: t.b,
		t: C.ggml_pool_2d(ctx.(*Context).ctx, t.t, C.GGML_OP_POOL_AVG, C.int(k), C.int(k), C.int(s), C.int(s), C.float(p), C.float(p)),
	}
}

func (t *Tensor) Set(ctx ml.Context, t2 ml.Tensor, offset int, strides ...int) ml.Tensor {
	var tt *C.struct_ggml_tensor
	switch len(strides) {
	case 0:
		tt = C.ggml_set_1d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset))
	case 1:
		tt = C.ggml_set_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.size_t(offset), C.size_t(strides[0]))
	default:
		panic("unsupported number of dimensions")
	}

	return &Tensor{b: t.b, t: tt}
}
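
// ScaledDotProductAttention computes attention with t as the query. When
// flash attention is enabled it uses ggml_flash_attn_ext at F32 precision;
// otherwise it falls back to an explicit K*Q, masked softmax, and value
// multiplication, permuting the result back to the query layout.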
func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.Tensor, scale float64) ml.Tensor {
	var kqMask *C.struct_ggml_tensor
	if mask != nil {
		kqMask = mask.(*Tensor).t
	}

	query := t.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)

	if t.b.flashAttention {
		value = value.Permute(ctx, 0, 2, 1, 3)

		kqv := C.ggml_flash_attn_ext(ctx.(*Context).ctx, query.(*Tensor).t, key.(*Tensor).t, value.(*Tensor).t, kqMask, C.float(scale), 0, 0)
		C.ggml_flash_attn_ext_set_prec(kqv, C.GGML_PREC_F32)
		return &Tensor{b: t.b, t: kqv}
	} else {
		kq := key.MulmatFullPrec(ctx, query)

		kq = &Tensor{
			b: t.b,
			t: C.ggml_soft_max_ext(ctx.(*Context).ctx, kq.(*Tensor).t, kqMask, C.float(scale), 0),
		}

		kqv := value.Mulmat(ctx, kq)
		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	}
}