Evaluations.svelte 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. <script lang="ts">
  2. import { onMount, getContext } from 'svelte';
  3. import dayjs from 'dayjs';
  4. import relativeTime from 'dayjs/plugin/relativeTime';
  5. dayjs.extend(relativeTime);
  6. import * as ort from 'onnxruntime-web';
  7. import { AutoModel, AutoTokenizer } from '@huggingface/transformers';
  // Model identifier passed to AutoTokenizer.from_pretrained and AutoModel.from_pretrained in onMount.
  8. const EMBEDDING_MODEL = 'TaylorAI/bge-micro-v2';
  // Tokenizer and model instances; both stay null until onMount has finished loading them.
  9. let tokenizer = null;
  10. let model = null;
  11. import { models } from '$lib/stores';
  12. import { deleteFeedbackById, getAllFeedbacks } from '$lib/apis/evaluations';
  13. import FeedbackMenu from './Evaluations/FeedbackMenu.svelte';
  14. import EllipsisHorizontal from '../icons/EllipsisHorizontal.svelte';
  15. import Tooltip from '../common/Tooltip.svelte';
  16. import Badge from '../common/Badge.svelte';
  17. import Pagination from '../common/Pagination.svelte';
  18. import MagnifyingGlass from '../icons/MagnifyingGlass.svelte';
  // i18n store fetched from the Svelte context; read in the template as `$i18n.t(...)`.
  19. const i18n = getContext('i18n');
  // Leaderboard rows; reassigned by rankHandler.
  20. let rankedModels = [];
  // Feedback records; loaded in onMount and filtered by deleteFeedbackHandler.
  21. let feedbacks = [];
  // Text of the search input (bound in the template); every change triggers debouncedQueryHandler.
  22. let query = '';
  // 1-based page of the feedback history table, bound to the Pagination component.
  23. let page = 1;
  // Cache of tag -> embedding, filled by getTagEmbeddings.
  24. let tagEmbeddings = new Map();
  // Set to true once the feedbacks have been fetched; the template renders only while it is true.
  25. let loaded = false;
  // Handle of the pending setTimeout created by debouncedQueryHandler.
  26. let debounceTimer;
  // Feedback rows of the current page (10 rows per page).
  27. $: paginatedFeedbacks = feedbacks.slice((page - 1) * 10, page * 10);
  /**
   * One evaluation record as this component reads it.
   */
  interface Feedback {
    id: string;
    data: {
      // Compared as a string against '1' (won), '0' (draw) and '-1' (lost).
      rating: number;
      // The model the rating applies to.
      model_id: string;
      // Treated as the opponents of `model_id` in the Elo calculation; may be null.
      sibling_model_ids: string[] | null;
      reason: string;
      comment: string;
      // Embedded and compared with the search query to weight the feedback.
      tags: string[];
    };
    user: {
      name: string;
      profile_image_url: string;
    };
    // Multiplied by 1000 before being handed to dayjs, i.e. handled as seconds.
    updated_at: number;
  }
  /**
   * Running Elo state of a single model while the feedbacks are replayed.
   */
  interface ModelStats {
    // Current Elo rating; calculateModelStats starts every model at 1000.
    rating: number;
    // Number of pairwise comparisons counted as wins.
    won: number;
    // Number of pairwise comparisons counted as losses.
    lost: number;
  }
  49. //////////////////////
  50. //
  51. // Rank models by Elo rating
  52. //
  53. //////////////////////
  /**
   * Rebuilds `rankedModels` from the `$models` store and the current feedbacks.
   *
   * Every listed model is decorated with its rounded Elo rating and win/loss
   * counters ('-' placeholders when no stats exist for it). Rated models come
   * first, best rating on top; unrated models follow, ordered by name.
   *
   * @param similarities Optional per-feedback weights (feedback id -> weight)
   *                     forwarded to calculateModelStats.
   */
  const rankHandler = async (similarities: Map<string, number> = new Map()) => {
    const modelStats = calculateModelStats(feedbacks, similarities);

    // Models owned by 'arena' and models flagged as hidden are left out.
    const listed = $models.filter(
      (m) => !(m?.owned_by === 'arena' || (m?.info?.meta?.hidden ?? false) === true)
    );

    const decorated = listed.map((model) => {
      const stats = modelStats.get(model.id);
      if (!stats) {
        return {
          ...model,
          rating: '-',
          stats: { count: 0, won: '-', lost: '-' }
        };
      }
      return {
        ...model,
        rating: Math.round(stats.rating),
        stats: {
          count: stats.won + stats.lost,
          won: stats.won.toString(),
          lost: stats.lost.toString()
        }
      };
    });

    rankedModels = decorated.sort((a, b) => {
      const aRated = a.rating !== '-';
      const bRated = b.rating !== '-';
      if (aRated && bRated) return b.rating - a.rating;
      // Exactly one side is rated: the rated one goes first.
      if (aRated !== bRated) return aRated ? -1 : 1;
      return a.name.localeCompare(b.name);
    });
  };
  /**
   * Replays the feedbacks in order as pairwise Elo matches and returns the
   * resulting per-model statistics.
   *
   * Each feedback with rating 1 (win) or -1 (loss) is scored as one match of
   * `model_id` against every entry of `sibling_model_ids`. Feedbacks with any
   * other rating, or without siblings, leave the result untouched. A model
   * only appears in the returned map once it has played at least one match.
   *
   * Reads the module-level `query`: while it is blank (empty or whitespace
   * only) every match has weight 1; otherwise the weight is the feedback's
   * entry in `similarities`, with a missing entry counting as 0.
   *
   * @param feedbacks    Feedback records to replay, in order.
   * @param similarities Feedback id -> weight in effect while a query is set.
   * @returns Map from model id to its rating and win/loss counters.
   */
  function calculateModelStats(
    feedbacks: Feedback[],
    similarities: Map<string, number>
  ): Map<string, ModelStats> {
    const stats = new Map<string, ModelStats>();
    const K = 32;
    // Same notion of "no query" as debouncedQueryHandler, which passes no
    // similarities for a whitespace-only query.
    const hasQuery = query.trim() !== '';

    // Returns the stored stats, or a detached default that is NOT stored.
    function getOrDefaultStats(modelId: string): ModelStats {
      return stats.get(modelId) || { rating: 1000, won: 0, lost: 0 };
    }

    function updateStats(modelId: string, ratingChange: number, outcome: number) {
      const currentStats = getOrDefaultStats(modelId);
      currentStats.rating += ratingChange;
      if (outcome === 1) currentStats.won++;
      else if (outcome === 0) currentStats.lost++;
      stats.set(modelId, currentStats);
    }

    function calculateEloChange(
      ratingA: number,
      ratingB: number,
      outcome: number,
      similarity: number
    ): number {
      const expectedScore = 1 / (1 + Math.pow(10, (ratingB - ratingA) / 400));
      return K * (outcome - expectedScore) * similarity;
    }

    for (const feedback of feedbacks) {
      let outcome: number;
      // String() instead of .toString() so a missing rating is skipped rather than thrown on.
      switch (String(feedback.data.rating)) {
        case '1':
          outcome = 1;
          break;
        case '-1':
          outcome = 0;
          break;
        default:
          continue; // Skip invalid ratings
      }

      const modelA = feedback.data.model_id;
      // `|| 0` (not `??`) so a NaN similarity is treated as no weight as well.
      const similarity = hasQuery ? similarities.get(feedback.id) || 0 : 1;

      for (const modelB of feedback.data.sibling_model_ids || []) {
        // Re-read both ratings for every opponent: a rating captured before the
        // loop goes stale as soon as the first match has been applied.
        const ratingA = getOrDefaultStats(modelA).rating;
        const ratingB = getOrDefaultStats(modelB).rating;

        // Both changes are derived from the pre-match ratings, then applied.
        const changeA = calculateEloChange(ratingA, ratingB, outcome, similarity);
        const changeB = calculateEloChange(ratingB, ratingA, 1 - outcome, similarity);

        updateStats(modelA, changeA, outcome);
        updateStats(modelB, changeB, 1 - outcome);
      }
    }

    return stats;
  }
  129. //////////////////////
  130. //
  131. // Calculate cosine similarity
  132. //
  133. //////////////////////
  /**
   * Cosine similarity of two numeric vectors of equal length.
   *
   * @returns dot(vecA, vecB) / (|vecA| * |vecB|), or 0 when either vector has
   *          zero magnitude (this includes two empty vectors).
   * @throws Error when the vectors differ in length.
   */
  const cosineSimilarity = (vecA, vecB) => {
    const dimension = vecA.length;
    if (dimension !== vecB.length) {
      throw new Error('Vectors must be the same length');
    }

    // Single pass: accumulate the dot product and both squared magnitudes.
    let dot = 0;
    let sumSquaresA = 0;
    let sumSquaresB = 0;
    for (let idx = 0; idx < dimension; idx += 1) {
      const a = vecA[idx];
      const b = vecB[idx];
      dot += a * b;
      sumSquaresA += a ** 2;
      sumSquaresB += b ** 2;
    }

    const magnitudeA = Math.sqrt(sumSquaresA);
    const magnitudeB = Math.sqrt(sumSquaresB);

    // A zero-length vector has no direction; report 0 instead of dividing by zero.
    return magnitudeA === 0 || magnitudeB === 0 ? 0 : dot / (magnitudeA * magnitudeB);
  };
  /**
   * Highest cosine similarity between the query embedding and any of the
   * given tag embeddings.
   *
   * The running maximum starts at 0, so the result is 0 for an empty map and
   * is never negative.
   */
  const calculateMaxSimilarity = (queryEmbedding, tagEmbeddings: Map<string, number[]>) => {
    let best = 0;
    tagEmbeddings.forEach((tagEmbedding) => {
      best = Math.max(best, cosineSimilarity(queryEmbedding, tagEmbedding));
    });
    return best;
  };
  166. //////////////////////
  167. //
  168. // Embedding functions
  169. //
  170. //////////////////////
  /**
   * Embeds `text` with the module-level tokenizer and model.
   *
   * The model's last hidden state is averaged with `.mean(1)` and the raw
   * data of the resulting tensor is returned.
   * NOTE(review): presumably dimension 1 is the token axis (mean pooling over
   * tokens) — confirm against the model output shape.
   */
  const getEmbeddings = async (text: string) => {
    const encoded = await tokenizer(text);
    const { last_hidden_state: hiddenStates } = await model(encoded);
    const pooled = hiddenStates.mean(1);
    return pooled.ort_tensor.data;
  };
  /**
   * Returns a map tag -> embedding for the given tags.
   *
   * Embeddings are taken from the module-level `tagEmbeddings` cache; a tag
   * that is not cached yet is embedded first and stored there. Tags are
   * processed one after another, in the order given.
   */
  const getTagEmbeddings = async (tags: string[]) => {
    const result = new Map();
    for (const tag of tags) {
      const isCached = tagEmbeddings.has(tag);
      if (!isCached) {
        const vector = await getEmbeddings(tag);
        tagEmbeddings.set(tag, vector);
      }
      result.set(tag, tagEmbeddings.get(tag));
    }
    return result;
  };
  /**
   * Re-ranks the leaderboard for the current `query`.
   *
   * - Blank query (empty or whitespace only): ranks immediately without
   *   similarity weights.
   * - Otherwise: waits 1.5 s after the last call, embeds the query, scores
   *   every feedback by the best cosine similarity between the query and the
   *   feedback's tags, and ranks with those scores as weights.
   *
   * Any pending debounce is cancelled first, in both cases, so a ranking
   * scheduled for an earlier query cannot run after the query was cleared.
   */
  const debouncedQueryHandler = async () => {
    // Cancel before the blank check: otherwise clearing the input leaves the
    // previous timer alive and it later embeds the (now empty) query.
    clearTimeout(debounceTimer);

    if (query.trim() === '') {
      rankHandler();
      return;
    }

    debounceTimer = setTimeout(async () => {
      try {
        const queryEmbedding = await getEmbeddings(query);
        const similarities = new Map<string, number>();

        for (const feedback of feedbacks) {
          const feedbackTags = feedback.data.tags || [];
          const feedbackTagEmbeddings = await getTagEmbeddings(feedbackTags);
          const maxSimilarity = calculateMaxSimilarity(queryEmbedding, feedbackTagEmbeddings);
          similarities.set(feedback.id, maxSimilarity);
        }

        rankHandler(similarities);
      } catch (err) {
        // Nothing awaits a timer callback, so report the failure here instead
        // of leaving an unhandled promise rejection.
        console.error(err);
      }
    }, 1500); // Debounce for 1.5 seconds
  };
  // Svelte reactive statement: referencing `query` makes this re-run debouncedQueryHandler on every change of `query`.
  206. $: query, debouncedQueryHandler();
  207. //////////////////////
  208. //
  209. // CRUD operations
  210. //
  211. //////////////////////
  /**
   * Deletes one feedback through the API and, when the call returns a truthy
   * response, removes it from the local `feedbacks` list.
   *
   * A rejected API call is reported through `toast.error` and leaves the
   * list unchanged.
   * NOTE(review): `toast` is not among the imports visible in this file —
   * confirm it is in scope, otherwise the error path throws a ReferenceError.
   *
   * @param feedbackId Id of the feedback to delete.
   */
  const deleteFeedbackHandler = async (feedbackId: string) => {
    const response = await deleteFeedbackById(localStorage.token, feedbackId).catch((err) => {
      toast.error(err);
      return null;
    });

    // Nothing was deleted: keep the list as it is.
    if (!response) {
      return;
    }

    feedbacks = feedbacks.filter((f) => f.id !== feedbackId);
  };
  /**
   * Component start-up:
   * 1. fetch the feedbacks and show the page,
   * 2. rank right away — plain Elo ranking does not need the embedding model,
   * 3. load tokenizer + model and pre-compute the embeddings of all tags,
   * 4. re-rank via debouncedQueryHandler, which uses similarity weights when a
   *    query was typed while the model was loading and plain ranking otherwise.
   */
  onMount(async () => {
    // `?? []` keeps the reactive `feedbacks.slice(...)` safe should the API yield a nullish result.
    feedbacks = (await getAllFeedbacks(localStorage.token)) ?? [];
    loaded = true;

    // Do not make the leaderboard wait for (or depend on) the model download.
    rankHandler();

    try {
      tokenizer = await AutoTokenizer.from_pretrained(EMBEDDING_MODEL);
      model = await AutoModel.from_pretrained(EMBEDDING_MODEL);

      // Pre-compute embeddings for all unique tags
      const allTags = new Set(feedbacks.flatMap((feedback) => feedback.data.tags || []));
      await getTagEmbeddings(Array.from(allTags));
    } catch (err) {
      // The unweighted leaderboard above stays usable; only topic search is unavailable.
      console.error(err);
      return;
    }

    // Blank query -> immediate plain re-rank; otherwise a debounced, similarity-weighted one.
    debouncedQueryHandler();
  });
  231. </script>
  232. {#if loaded}
  233. <div class="mt-0.5 mb-2 gap-1 flex flex-col md:flex-row justify-between">
  234. <div class="flex md:self-center text-lg font-medium px-0.5 shrink-0 items-center">
  235. <div class=" gap-1">
  236. {$i18n.t('Leaderboard')}
  237. </div>
  238. <div class="flex self-center w-[1px] h-6 mx-2.5 bg-gray-50 dark:bg-gray-850" />
  239. <span class="text-lg font-medium text-gray-500 dark:text-gray-300 mr-1.5"
  240. >{rankedModels.length}</span
  241. >
  242. </div>
  243. <div class=" flex space-x-2">
  244. <Tooltip content={$i18n.t('Re-rank models by topic similarity')}>
  245. <div class="flex flex-1">
  246. <div class=" self-center ml-1 mr-3">
  247. <MagnifyingGlass className="size-3" />
  248. </div>
  249. <input
  250. class=" w-full text-sm pr-4 py-1 rounded-r-xl outline-none bg-transparent"
  251. bind:value={query}
  252. placeholder={$i18n.t('Search')}
  253. />
  254. </div>
  255. </Tooltip>
  256. </div>
  257. </div>
  258. <div
  259. class="scrollbar-hidden relative whitespace-nowrap overflow-x-auto max-w-full rounded pt-0.5"
  260. >
  261. {#if (rankedModels ?? []).length === 0}
  262. <div class="text-center text-xs text-gray-500 dark:text-gray-400 py-1">
  263. {$i18n.t('No models found')}
  264. </div>
  265. {:else}
  266. <table
  267. class="w-full text-sm text-left text-gray-500 dark:text-gray-400 table-auto max-w-full rounded"
  268. >
  269. <thead
  270. class="text-xs text-gray-700 uppercase bg-gray-50 dark:bg-gray-850 dark:text-gray-400 -translate-y-0.5"
  271. >
  272. <tr class="">
  273. <th scope="col" class="px-3 py-1.5 cursor-pointer select-none w-3">
  274. {$i18n.t('RK')}
  275. </th>
  276. <th scope="col" class="px-3 py-1.5 cursor-pointer select-none">
  277. {$i18n.t('Model')}
  278. </th>
  279. <th scope="col" class="px-3 py-1.5 text-right cursor-pointer select-none w-fit">
  280. {$i18n.t('Rating')}
  281. </th>
  282. <th scope="col" class="px-3 py-1.5 text-right cursor-pointer select-none w-5">
  283. {$i18n.t('Won')}
  284. </th>
  285. <th scope="col" class="px-3 py-1.5 text-right cursor-pointer select-none w-5">
  286. {$i18n.t('Lost')}
  287. </th>
  288. </tr>
  289. </thead>
  290. <tbody class="">
  291. {#each rankedModels as model, modelIdx (model.id)}
  292. <tr class="bg-white dark:bg-gray-900 dark:border-gray-850 text-xs group">
  293. <td class="px-3 py-1.5 text-left font-medium text-gray-900 dark:text-white w-fit">
  294. <div class=" line-clamp-1">
  295. {model?.rating !== '-' ? modelIdx + 1 : '-'}
  296. </div>
  297. </td>
  298. <td class="px-3 py-1.5 flex flex-col justify-center">
  299. <div class="flex items-center gap-2">
  300. <div class="flex-shrink-0">
  301. <img
  302. src={model?.info?.meta?.profile_image_url ?? '/favicon.png'}
  303. alt={model.name}
  304. class="size-5 rounded-full object-cover shrink-0"
  305. />
  306. </div>
  307. <div class="font-medium text-gray-800 dark:text-gray-200 pr-4">
  308. {model.name}
  309. </div>
  310. </div>
  311. </td>
  312. <td class="px-3 py-1.5 text-right font-medium text-gray-900 dark:text-white w-max">
  313. {model.rating}
  314. </td>
  315. <td class=" px-3 py-1.5 text-right font-semibold text-green-500">
  316. <div class=" w-10">
  317. {#if model.stats.won === '-'}
  318. -
  319. {:else}
  320. <span class="hidden group-hover:inline"
  321. >{((model.stats.won / model.stats.count) * 100).toFixed(1)}%</span
  322. >
  323. <span class=" group-hover:hidden">{model.stats.won}</span>
  324. {/if}
  325. </div>
  326. </td>
  327. <td class="px-3 py-1.5 text-right font-semibold text-red-500">
  328. <div class=" w-10">
  329. {#if model.stats.lost === '-'}
  330. -
  331. {:else}
  332. <span class="hidden group-hover:inline"
  333. >{((model.stats.lost / model.stats.count) * 100).toFixed(1)}%</span
  334. >
  335. <span class=" group-hover:hidden">{model.stats.lost}</span>
  336. {/if}
  337. </div>
  338. </td>
  339. </tr>
  340. {/each}
  341. </tbody>
  342. </table>
  343. {/if}
  344. </div>
  345. <div class=" text-gray-500 text-xs mt-1.5 w-full flex justify-end">
  346. <div class=" text-right">
  347. <div class="line-clamp-1">
  348. ⓘ {$i18n.t(
  349. 'The evaluation leaderboard is based on the Elo rating system and is updated in real-time.'
  350. )}
  351. </div>
  352. {$i18n.t(
  353. 'The leaderboard is currently in beta, and we may adjust the rating calculations as we refine the algorithm.'
  354. )}
  355. </div>
  356. </div>
  357. <div class="pb-4"></div>
  358. <div class="mt-0.5 mb-2 gap-1 flex flex-col md:flex-row justify-between">
  359. <div class="flex md:self-center text-lg font-medium px-0.5">
  360. {$i18n.t('Feedback History')}
  361. <div class="flex self-center w-[1px] h-6 mx-2.5 bg-gray-50 dark:bg-gray-850" />
  362. <span class="text-lg font-medium text-gray-500 dark:text-gray-300">{feedbacks.length}</span>
  363. </div>
  364. </div>
  365. <div
  366. class="scrollbar-hidden relative whitespace-nowrap overflow-x-auto max-w-full rounded pt-0.5"
  367. >
  368. {#if (feedbacks ?? []).length === 0}
  369. <div class="text-center text-xs text-gray-500 dark:text-gray-400 py-1">
  370. {$i18n.t('No feedbacks found')}
  371. </div>
  372. {:else}
  373. <table
  374. class="w-full text-sm text-left text-gray-500 dark:text-gray-400 table-auto max-w-full rounded"
  375. >
  376. <thead
  377. class="text-xs text-gray-700 uppercase bg-gray-50 dark:bg-gray-850 dark:text-gray-400 -translate-y-0.5"
  378. >
  379. <tr class="">
  380. <th scope="col" class="px-3 text-right cursor-pointer select-none w-0">
  381. {$i18n.t('User')}
  382. </th>
  383. <th scope="col" class="px-3 pr-1.5 cursor-pointer select-none">
  384. {$i18n.t('Models')}
  385. </th>
  386. <th scope="col" class="px-3 py-1.5 text-right cursor-pointer select-none w-fit">
  387. {$i18n.t('Result')}
  388. </th>
  389. <th scope="col" class="px-3 py-1.5 text-right cursor-pointer select-none w-0">
  390. {$i18n.t('Updated At')}
  391. </th>
  392. <th scope="col" class="px-3 py-1.5 text-right cursor-pointer select-none w-0"> </th>
  393. </tr>
  394. </thead>
  395. <tbody class="">
  396. {#each paginatedFeedbacks as feedback (feedback.id)}
  397. <tr class="bg-white dark:bg-gray-900 dark:border-gray-850 text-xs">
  398. <td class=" py-0.5 text-right font-semibold">
  399. <div class="flex justify-center">
  400. <Tooltip content={feedback?.user?.name}>
  401. <div class="flex-shrink-0">
  402. <img
  403. src={feedback?.user?.profile_image_url ?? '/user.png'}
  404. alt={feedback?.user?.name}
  405. class="size-5 rounded-full object-cover shrink-0"
  406. />
  407. </div>
  408. </Tooltip>
  409. </div>
  410. </td>
  411. <td class=" py-1 pl-3 flex flex-col">
  412. <div class="flex flex-col items-start gap-0.5 h-full">
  413. <div class="flex flex-col h-full">
  414. {#if feedback.data?.sibling_model_ids}
  415. <div class="font-semibold text-gray-600 dark:text-gray-400 flex-1">
  416. {feedback.data?.model_id}
  417. </div>
  418. <Tooltip content={feedback.data.sibling_model_ids.join(', ')}>
  419. <div class=" text-[0.65rem] text-gray-600 dark:text-gray-400 line-clamp-1">
  420. {#if feedback.data.sibling_model_ids.length > 2}
  421. <!-- {$i18n.t('and {{COUNT}} more')} -->
  422. {feedback.data.sibling_model_ids.slice(0, 2).join(', ')}, {$i18n.t(
  423. 'and {{COUNT}} more',
  424. { COUNT: feedback.data.sibling_model_ids.length - 2 }
  425. )}
  426. {:else}
  427. {feedback.data.sibling_model_ids.join(', ')}
  428. {/if}
  429. </div>
  430. </Tooltip>
  431. {:else}
  432. <div
  433. class=" text-sm font-medium text-gray-600 dark:text-gray-400 flex-1 py-1.5"
  434. >
  435. {feedback.data?.model_id}
  436. </div>
  437. {/if}
  438. </div>
  439. </div>
  440. </td>
  441. <td class="px-3 py-1 text-right font-medium text-gray-900 dark:text-white w-max">
  442. <div class=" flex justify-end">
  443. {#if feedback.data.rating.toString() === '1'}
  444. <Badge type="info" content={$i18n.t('Won')} />
  445. {:else if feedback.data.rating.toString() === '0'}
  446. <Badge type="muted" content={$i18n.t('Draw')} />
  447. {:else if feedback.data.rating.toString() === '-1'}
  448. <Badge type="error" content={$i18n.t('Lost')} />
  449. {/if}
  450. </div>
  451. </td>
  452. <td class=" px-3 py-1 text-right font-medium">
  453. {dayjs(feedback.updated_at * 1000).fromNow()}
  454. </td>
  455. <td class=" px-3 py-1 text-right font-semibold">
  456. <FeedbackMenu
  457. on:delete={(e) => {
  458. deleteFeedbackHandler(feedback.id);
  459. }}
  460. >
  461. <button
  462. class="self-center w-fit text-sm p-1.5 dark:text-gray-300 dark:hover:text-white hover:bg-black/5 dark:hover:bg-white/5 rounded-xl"
  463. >
  464. <EllipsisHorizontal />
  465. </button>
  466. </FeedbackMenu>
  467. </td>
  468. </tr>
  469. {/each}
  470. </tbody>
  471. </table>
  472. {/if}
  473. </div>
  474. {#if feedbacks.length > 10}
  475. <Pagination bind:page count={feedbacks.length} perPage={10} />
  476. {/if}
  477. <div class="pb-8"></div>
  478. {/if}