// generateSource.ts
  1. import { Artwork, RawArtwork } from './types';
  2. import { HuggingFaceTransformersEmbeddings } from 'langchain/embeddings/hf_transformers';
  3. import { Chroma } from "langchain/vectorstores/chroma";
  4. import { Document } from "langchain/document";
  5. import { ChromaClient } from "chromadb";
// Value sent as the `limit` query parameter of each artist search request.
const numberOfArtworks = 15;
// list of artists we are going to pull from the API
const artists = ["van Gogh", "Renoir", "Monet", "Picasso"]
  9. const generateSource = async () => {
  10. // Delete the existing vector store so that we don't get duplicate documents
  11. await new ChromaClient().deleteCollection({
  12. name: "artcollection",
  13. });
  14. const allartworkdocs = await getArt(artists);
  15. // Create the vector store
  16. const vectorStore = await Chroma.fromDocuments(allartworkdocs, embedding, { collectionName: "artcollection" });
  17. console.log(`Created vector store with ${await vectorStore.collection?.count()} documents`);
  18. }
  19. const getArt = async (artists: string[]) => {
  20. const artworks: Artwork[] = [];
  21. const artistsWorkIds: number[] = []
  22. for (const artist of artists) {
  23. // First get the ids of the works by each artist
  24. const thisIds = await fetchArtistWorkIds(artist);
  25. console.log(`Fetching ${artist}`);
  26. await (new Promise(r => setTimeout(r, 1000)));
  27. artistsWorkIds.push(...thisIds);
  28. };
  29. // now get the actual artwork
  30. const artwork = await fetchArtwork(artistsWorkIds);
  31. return artwork
  32. }
  33. const fetchArtistWorkIds = async (artist: string): Promise<number[]> => {
  34. const artistURL = `https://api.artic.edu/api/v1/artworks/search?q=${artist}&limit=${numberOfArtworks}`;
  35. const response = await fetch(artistURL);
  36. const json = await response.json();
  37. const artistWorks: { id: number }[] = json.data;
  38. const justIds = artistWorks.map((work) => work.id);
  39. return justIds;
  40. }
// Embeddings instance passed to Chroma.fromDocuments when the vector store is built.
const embedding = new HuggingFaceTransformersEmbeddings({
  modelName: "Xenova/all-MiniLM-L6-v2",
});
  44. //Turns out there are some weird characters in the descriptions
  45. const sanitize = (badstring: string): string => {
  46. let goodstring = " ";
  47. if (badstring !== null) {
  48. goodstring = badstring
  49. .replace(/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]>/gm, "")
  50. .replace(/<\/a>/gm, "")
  51. .replace(/<\/?em>/gm, "")
  52. .replace(/[\u2018\u2019]/gm, "")
  53. .replace(/[\u201C\u201D]/gm, "")
  54. .replace(/[\u2013\u2014]/gm, "-")
  55. .replace(/[\u2026]/gm, "...")
  56. .replace(/[\u00A0]/gm, " ")
  57. .replace(/[\u00AD]/gm, "-")
  58. .replace(/[\u00B0]/gm, " degrees ")
  59. .replace(/[\u00B1]/gm, " plus or minus ")
  60. .replace(/[\u00B2]/gm, " squared ")
  61. .replace(/[\u00B3]/gm, " cubed ")
  62. .replace(/[\u00B4]/gm, "'")
  63. .replace(/[\u00B5]/gm, " micro ")
  64. .replace(/[\u00B6]/gm, " paragraph ")
  65. .replace(/[\u00B7]/gm, " dot ")
  66. .replace(/[\u00B8]/gm, ",")
  67. .replace(/[\u00B9]/gm, " first ")
  68. .replace(/[\u00BA]/gm, " degrees ")
  69. .replace(/[\u00BB]/gm, ">>")
  70. .replace(/[\u00BC]/gm, " 1/4 ")
  71. .replace(/[\u00BD]/gm, " 1/2 ")
  72. .replace(/[\uFB01]/gm, "fi")
  73. .replace(/[\uFB02]/gm, "fl")
  74. .replace(/[\uFB03]/gm, "ffi")
  75. .replace(/[\uFB04]/gm, "ffl")
  76. .replace(/[\uFB05]/gm, "ft")
  77. .replace(/[\uFB06\uFB07\uFB08]/gm, "st")
  78. .replace(/[\u00D7]/gm, "x")
  79. .replace(/[\u00E8\u00E9]/gm, "e")
  80. .replace(/[\u00F1]/gm, "n")
  81. .replace(/[\u00F6]/gm, "o")
  82. .replace(/[\u00F8]/gm, "o")
  83. .replace(/[\u00FC]/gm, "u")
  84. .replace(/[\u00FF]/gm, "y")
  85. .replace(/[\u0101\u0103\u00E0]/gm, "a")
  86. .replace(/[\u00C9]/gm, "E")
  87. .replace(/<p>/gm, "")
  88. .replace(/<\/p>/gm, "")
  89. .replace(/\n/gm, "");
  90. };
  91. return goodstring;
  92. }
  93. const fetchArtwork = async (workids: number[]) => {
  94. const docsarray = [];
  95. const artworks: Artwork[] = [];
  96. for await (const workid of workids) {
  97. const artworkURL = `https://api.artic.edu/api/v1/artworks/${workid}`;
  98. const response = await fetch(artworkURL);
  99. const json = await response.json();
  100. const artworkraw: RawArtwork = await json.data as RawArtwork;
  101. const description = sanitize(artworkraw.description)
  102. if (description !== " ") {
  103. const doc = new Document({
  104. pageContent: description,
  105. metadata: {
  106. title: sanitize(artworkraw.title),
  107. date: artworkraw.date_end,
  108. artistName: artworkraw.artist_title,
  109. }
  110. });
  111. docsarray.push(doc);
  112. console.log("------------------")
  113. console.log(`${artworkraw.title} - ${artworkraw.artist_title}`);
  114. }
  115. }
  116. return docsarray;
  117. }
  118. generateSource();