import { Artwork, RawArtwork } from './types'; import { HuggingFaceTransformersEmbeddings } from 'langchain/embeddings/hf_transformers'; import { Chroma } from "langchain/vectorstores/chroma"; import { Document } from "langchain/document"; import { ChromaClient } from "chromadb"; const numberOfArtworks = 10; // list of artists we are going to pull from the API const artists = ["van Gogh", "Renoir", "Monet", "Picasso"] const generateSource = async () => { // Delete the existing vector store so that we don't get duplicate documents await new ChromaClient().deleteCollection({ name: "artcollection", }); const allartworkdocs = await getArt(artists); // Create the vector store const vectorStore = await Chroma.fromDocuments(allartworkdocs, embedding, { collectionName: "artcollection" }); console.log(`Created vector store with ${await vectorStore.collection?.count()} documents`); } const getArt = async (artists: string[]) => { const artworks: Artwork[] = []; const artistsWorkIds: number[] = [] for (const artist of artists) { // First get the ids of the works by each artist const thisIds = await fetchArtistWorkIds(artist); console.log(`Fetching ${artist}`); await (new Promise(r => setTimeout(r, 1000))); artistsWorkIds.push(...thisIds); }; // now get the actual artwork const artwork = await fetchArtwork(artistsWorkIds); return artwork } const fetchArtistWorkIds = async (artist: string): Promise => { const artistURL = `https://api.artic.edu/api/v1/artworks/search?q=${artist}&limit=${numberOfArtworks}`; const response = await fetch(artistURL); const json = await response.json(); const artistWorks: { id: number }[] = json.data; return artistWorks.map((work) => work.id); } const embedding = new HuggingFaceTransformersEmbeddings({ modelName: "Xenova/all-MiniLM-L6-v2", }); //Turns out there are some weird characters in the descriptions const sanitize = (badstring: string): string => { let goodstring = " "; if (badstring !== null) { goodstring = badstring .replace(/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]>/gm, "") .replace(/<\/a>/gm, "") .replace(/<\/?em>/gm, "") .replace(/[\u2018\u2019]/gm, "") .replace(/[\u201C\u201D]/gm, "") .replace(/[\u2013\u2014]/gm, "-") .replace(/[\u2026]/gm, "...") .replace(/[\u00A0]/gm, " ") .replace(/[\u00AD]/gm, "-") .replace(/[\u00B0]/gm, " degrees ") .replace(/[\u00B1]/gm, " plus or minus ") .replace(/[\u00B2]/gm, " squared ") .replace(/[\u00B3]/gm, " cubed ") .replace(/[\u00B4]/gm, "'") .replace(/[\u00B5]/gm, " micro ") .replace(/[\u00B6]/gm, " paragraph ") .replace(/[\u00B7]/gm, " dot ") .replace(/[\u00B8]/gm, ",") .replace(/[\u00B9]/gm, " first ") .replace(/[\u00BA]/gm, " degrees ") .replace(/[\u00BB]/gm, ">>") .replace(/[\u00BC]/gm, " 1/4 ") .replace(/[\u00BD]/gm, " 1/2 ") .replace(/[\uFB01]/gm, "fi") .replace(/[\uFB02]/gm, "fl") .replace(/[\uFB03]/gm, "ffi") .replace(/[\uFB04]/gm, "ffl") .replace(/[\uFB05]/gm, "ft") .replace(/[\uFB06\uFB07\uFB08]/gm, "st") .replace(/[\u00D7]/gm, "x") .replace(/[\u00E8\u00E9]/gm, "e") .replace(/[\u00F1]/gm, "n") .replace(/[\u00F6]/gm, "o") .replace(/[\u00F8]/gm, "o") .replace(/[\u00FC]/gm, "u") .replace(/[\u00FF]/gm, "y") .replace(/[\u0101\u0103\u00E0]/gm, "a") .replace(/[\u00C9]/gm, "E") .replace(/

/gm, "") .replace(/<\/p>/gm, "") .replace(/\n/gm, ""); }; return goodstring; } const fetchArtwork = async (workids: number[]) => { const docsarray = []; const artworks: Artwork[] = []; for await (const workid of workids) { const artworkURL = `https://api.artic.edu/api/v1/artworks/${workid}`; const response = await fetch(artworkURL); const json = await response.json(); const artworkraw: RawArtwork = await json.data as RawArtwork; const description = sanitize(artworkraw.description) if (description !== " ") { const doc = new Document({ pageContent: description, metadata: { title: sanitize(artworkraw.title), date: artworkraw.date_end, artistName: artworkraw.artist_title, } }); docsarray.push(doc); console.log("------------------") console.log(`${artworkraw.title} - ${artworkraw.artist_title}`); } } return docsarray; } generateSource();