youtube.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. import logging
  2. from typing import Any, Dict, Generator, List, Optional, Sequence, Union
  3. from urllib.parse import parse_qs, urlparse
  4. from langchain_core.documents import Document
  5. from open_webui.env import SRC_LOG_LEVELS
# Module-level logger, named after this module. Its level is read from the
# "RAG" entry of SRC_LOG_LEVELS (imported from open_webui.env).
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
  8. ALLOWED_SCHEMES = {"http", "https"}
  9. ALLOWED_NETLOCS = {
  10. "youtu.be",
  11. "m.youtube.com",
  12. "youtube.com",
  13. "www.youtube.com",
  14. "www.youtube-nocookie.com",
  15. "vid.plus",
  16. }
  17. def _parse_video_id(url: str) -> Optional[str]:
  18. """Parse a YouTube URL and return the video ID if valid, otherwise None."""
  19. parsed_url = urlparse(url)
  20. if parsed_url.scheme not in ALLOWED_SCHEMES:
  21. return None
  22. if parsed_url.netloc not in ALLOWED_NETLOCS:
  23. return None
  24. path = parsed_url.path
  25. if path.endswith("/watch"):
  26. query = parsed_url.query
  27. parsed_query = parse_qs(query)
  28. if "v" in parsed_query:
  29. ids = parsed_query["v"]
  30. video_id = ids if isinstance(ids, str) else ids[0]
  31. else:
  32. return None
  33. else:
  34. path = parsed_url.path.lstrip("/")
  35. video_id = path.split("/")[-1]
  36. if len(video_id) != 11: # Video IDs are 11 characters long
  37. return None
  38. return video_id
  39. class YoutubeLoader:
  40. """Load `YouTube` video transcripts."""
  41. def __init__(
  42. self,
  43. video_id: str,
  44. language: Union[str, Sequence[str]] = "en",
  45. proxy_url: Optional[str] = None,
  46. ):
  47. """Initialize with YouTube video ID."""
  48. _video_id = _parse_video_id(video_id)
  49. self.video_id = _video_id if _video_id is not None else video_id
  50. self._metadata = {"source": video_id}
  51. self.language = language
  52. self.proxy_url = proxy_url
  53. if isinstance(language, str):
  54. self.language = [language]
  55. else:
  56. self.language = language
  57. def load(self) -> List[Document]:
  58. """Load YouTube transcripts into `Document` objects."""
  59. try:
  60. from youtube_transcript_api import (
  61. NoTranscriptFound,
  62. TranscriptsDisabled,
  63. YouTubeTranscriptApi,
  64. )
  65. except ImportError:
  66. raise ImportError(
  67. 'Could not import "youtube_transcript_api" Python package. '
  68. "Please install it with `pip install youtube-transcript-api`."
  69. )
  70. if self.proxy_url:
  71. youtube_proxies = {
  72. "http": self.proxy_url,
  73. "https": self.proxy_url,
  74. }
  75. # Don't log complete URL because it might contain secrets
  76. log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
  77. else:
  78. youtube_proxies = None
  79. try:
  80. transcript_list = YouTubeTranscriptApi.list_transcripts(
  81. self.video_id, proxies=youtube_proxies
  82. )
  83. except Exception as e:
  84. log.exception("Loading YouTube transcript failed")
  85. return []
  86. try:
  87. transcript = transcript_list.find_transcript(self.language)
  88. except NoTranscriptFound:
  89. transcript = transcript_list.find_transcript(["en"])
  90. transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
  91. transcript = " ".join(
  92. map(
  93. lambda transcript_piece: transcript_piece["text"].strip(" "),
  94. transcript_pieces,
  95. )
  96. )
  97. return [Document(page_content=transcript, metadata=self._metadata)]