123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329 |
- // Package blobstore implements a blob store.
- package blobstore
- import (
- "bytes"
- "crypto/sha256"
- "errors"
- "fmt"
- "io"
- "io/fs"
- "os"
- "path/filepath"
- "strings"
- "time"
- "github.com/ollama/ollama/x/types/structs"
- )
var (
	// ErrInvalidID is the sentinel error for an invalid blob ID.
	//
	// NOTE(review): nothing in this section of the file returns it; ParseID
	// and UnmarshalText report bad input by producing the zero ID instead.
	ErrInvalidID = errors.New("invalid ID")
)

// HashSize is the number of bytes in an ID's hash (a SHA-256 digest).
const HashSize = 32

// An ID is a blob output key, the hash of an output of a computation.
//
// The zero ID is the invalid ID: Valid reports false for it and String
// renders it as the empty string.
type ID struct {
	a [HashSize]byte
}

// MarshalText returns the String form of id. It never returns an error.
// An invalid (zero) ID marshals to empty text.
func (id ID) MarshalText() ([]byte, error) {
	return []byte(id.String()), nil
}

// UnmarshalText sets *id to the result of ParseID(text).
//
// It always returns nil: text that does not parse leaves *id as the zero
// (invalid) ID, so callers that care must check Valid afterwards.
func (id *ID) UnmarshalText(text []byte) error {
	*id = ParseID(string(text))
	return nil
}

// ParseID parses s, which must have the form "sha256-" followed by exactly
// HashSize*2 hexadecimal digits, and returns the corresponding ID.
//
// It returns the zero ID (for which Valid reports false) if s is malformed
// in any way: missing prefix, wrong length, or any non-hex character.
func ParseID(s string) ID {
	const prefix = "sha256-"
	h, ok := strings.CutPrefix(s, prefix)
	if !ok || len(h) != HashSize*2 {
		return ID{}
	}
	var b []byte
	if _, err := fmt.Sscanf(h, "%x", &b); err != nil {
		return ID{}
	}
	// Sscanf skips leading whitespace and stops without error at the first
	// non-hex character, so a malformed digest can decode to fewer than
	// HashSize bytes. Since len(h) is exactly HashSize*2, a full-length
	// result proves every character of h was consumed as a hex digit.
	if len(b) != HashSize {
		return ID{}
	}
	var id ID
	copy(id.a[:], b)
	return id
}

// String returns id in the form "sha256-<lowercase hex>", or the empty
// string if id is not valid.
func (id ID) String() string {
	if !id.Valid() {
		return ""
	}
	return fmt.Sprintf("sha256-%x", id.a[:])
}

// Valid reports whether id is anything other than the zero ID.
func (id ID) Valid() bool {
	return id != ID{}
}

// Match reports whether id's hash equals h.
func (id ID) Match(h [HashSize]byte) bool {
	return id.a == h
}
// A Store is a blob store, backed by a file system directory tree.
type Store struct {
	dir string           // root directory; blob files live under dir/blobs (see fileName)
	now func() time.Time // clock used to stamp stored files; Open sets it to time.Now
}
- // Open opens and returns the store in the given directory.
- //
- // It is safe for multiple processes on a single machine to use the
- // same store directory in a local file system simultaneously.
- // They will coordinate using operating system file locks and may
- // duplicate effort but will not corrupt the store.
- //
- // However, it is NOT safe for multiple processes on different machines
- // to share a store directory (for example, if the directory were stored
- // in a network file system). File locking is notoriously unreliable in
- // network file systems and may not suffice to protect the store.
- func Open(dir string) (*Store, error) {
- info, err := os.Stat(dir)
- if err != nil {
- return nil, err
- }
- if !info.IsDir() {
- return nil, &fs.PathError{Op: "open", Path: dir, Err: fmt.Errorf("not a directory")}
- }
- if err := os.MkdirAll(filepath.Join(dir, "blobs"), 0777); err != nil {
- return nil, err
- }
- c := &Store{
- dir: dir,
- now: time.Now,
- }
- return c, nil
- }
- func (s *Store) Dir() string {
- return s.dir
- }
- // fileName returns the name of the blob file corresponding to the given id.
- func (s *Store) fileName(id ID) string {
- return filepath.Join(s.dir, "blobs", fmt.Sprintf("sha256-%x", id.a[:]))
- }
// An entryNotFoundError reports that a store entry could not be found.
// Err, when non-nil, carries the underlying cause.
type entryNotFoundError struct {
	Err error
}

// Error returns "store entry not found", followed by ": " and the
// underlying cause when one is set.
func (e *entryNotFoundError) Error() string {
	if e.Err != nil {
		return fmt.Sprintf("store entry not found: %v", e.Err)
	}
	return "store entry not found"
}

// Unwrap returns the underlying cause, which may be nil.
func (e *entryNotFoundError) Unwrap() error {
	cause := e.Err
	return cause
}
// An Entry describes a blob found in the store, as returned by Get.
type Entry struct {
	// NOTE(review): presumably marks Entry as not comparable with ==;
	// confirm against package structs.
	_    structs.Incomparable
	ID   ID        // ID of the blob that was looked up
	Size int64     // size in bytes of the blob file
	Time time.Time // when added to store; Get fills it from the file's modification time
}
- // GetFile looks up the blob ID in the store and returns
- // the name of the corresponding data file.
- func GetFile(s *Store, id ID) (file string, entry Entry, err error) {
- entry, err = s.Get(id)
- if err != nil {
- return "", Entry{}, err
- }
- file = s.OutputFilename(entry.ID)
- info, err := os.Stat(file)
- if err != nil {
- return "", Entry{}, &entryNotFoundError{Err: err}
- }
- if info.Size() != entry.Size {
- return "", Entry{}, &entryNotFoundError{Err: errors.New("file incomplete")}
- }
- return file, entry, nil
- }
- // GetBytes looks up the blob ID in the store and returns
- // the corresponding output bytes.
- // GetBytes should only be used for data that can be expected to fit in memory.
- func GetBytes(s *Store, id ID) ([]byte, Entry, error) {
- entry, err := s.Get(id)
- if err != nil {
- return nil, entry, err
- }
- data, _ := os.ReadFile(s.OutputFilename(entry.ID))
- if entry.ID.Match(sha256.Sum256(data)) {
- return nil, entry, &entryNotFoundError{Err: errors.New("bad checksum")}
- }
- return data, entry, nil
- }
- // OutputFilename returns the name of the blob file for the given ID.
- func (s *Store) OutputFilename(id ID) string {
- file := s.fileName(id)
- // TODO(bmizerany): touch as "used" for cache trimming. (see
- // cache.go in cmd/go/internal/cache for the full reference implementation to go off of.
- return file
- }
- // Get looks up the blob ID in the store,
- // returning the corresponding output ID and file size, if any.
- // Note that finding an output ID does not guarantee that the
- // saved file for that output ID is still available.
- func (s *Store) Get(id ID) (Entry, error) {
- file := s.fileName(id)
- info, err := os.Stat(file)
- if err != nil {
- return Entry{}, &entryNotFoundError{Err: err}
- }
- return Entry{
- ID: id,
- Size: info.Size(),
- Time: info.ModTime(),
- }, nil
- }
// Close closes the store. It currently does nothing and always returns nil.
func (s *Store) Close() error {
	// TODO(bmizerany): return c.Trim()
	return nil
}
- // Put stores the data read from the given file into the store as ID.
- //
- // It may read file twice. The content of file must not change between the
- // two passes.
- func (s *Store) Put(file io.ReadSeeker) (ID, int64, error) {
- return s.put(file)
- }
- func PutBytes(s *Store, data []byte) (ID, int64, error) {
- return s.Put(bytes.NewReader(data))
- }
- func PutString(s *Store, data string) (ID, int64, error) {
- return s.Put(strings.NewReader(data))
- }
- func (s *Store) put(file io.ReadSeeker) (ID, int64, error) {
- // Compute output ID.
- h := sha256.New()
- if _, err := file.Seek(0, 0); err != nil {
- return ID{}, 0, err
- }
- size, err := io.Copy(h, file)
- if err != nil {
- return ID{}, 0, err
- }
- var out ID
- h.Sum(out.a[:0])
- // Copy to blob file (if not already present).
- if err := s.copyFile(file, out, size); err != nil {
- return out, size, err
- }
- // TODO: Add to manifest index.
- return out, size, nil
- }
- // copyFile copies file into the store, expecting it to have the given
- // output ID and size, if that file is not present already.
- func (s *Store) copyFile(file io.ReadSeeker, out ID, size int64) error {
- name := s.fileName(out)
- println("name", name)
- info, err := os.Stat(name)
- if err == nil && info.Size() == size {
- // Check hash.
- if f, err := os.Open(name); err == nil {
- h := sha256.New()
- io.Copy(h, f)
- f.Close()
- var out2 ID
- h.Sum(out2.a[:0])
- if out == out2 {
- return nil
- }
- }
- // Hash did not match. Fall through and rewrite file.
- }
- // Copy file to blobs directory.
- mode := os.O_RDWR | os.O_CREATE
- if err == nil && info.Size() > size { // shouldn't happen but fix in case
- mode |= os.O_TRUNC
- }
- f, err := os.OpenFile(name, mode, 0666)
- if err != nil {
- return err
- }
- defer f.Close()
- if size == 0 {
- // File now exists with correct size.
- // Only one possible zero-length file, so contents are OK too.
- // Early return here makes sure there's a "last byte" for code below.
- return nil
- }
- // From here on, if any of the I/O writing the file fails,
- // we make a best-effort attempt to truncate the file f
- // before returning, to avoid leaving bad bytes in the file.
- // Copy file to f, but also into h to double-check hash.
- if _, err := file.Seek(0, 0); err != nil {
- f.Truncate(0)
- return err
- }
- h := sha256.New()
- w := io.MultiWriter(f, h)
- if _, err := io.CopyN(w, file, size-1); err != nil {
- f.Truncate(0)
- return err
- }
- // Check last byte before writing it; writing it will make the size match
- // what other processes expect to find and might cause them to start
- // using the file.
- buf := make([]byte, 1)
- if _, err := file.Read(buf); err != nil {
- f.Truncate(0)
- return err
- }
- h.Write(buf)
- sum := h.Sum(nil)
- if !bytes.Equal(sum, out.a[:]) {
- f.Truncate(0)
- return fmt.Errorf("file content changed underfoot")
- }
- // Commit manifest entry.
- if _, err := f.Write(buf); err != nil {
- f.Truncate(0)
- return err
- }
- if err := f.Close(); err != nil {
- // Data might not have been written,
- // but file may look like it is the right size.
- // To be extra careful, remove stored file.
- os.Remove(name)
- return err
- }
- os.Chtimes(name, s.now(), s.now()) // mainly for tests
- return nil
- }
|