youtube.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. from typing import Any, Dict, Generator, List, Optional, Sequence, Union
  2. from urllib.parse import parse_qs, urlparse
  3. from langchain_core.documents import Document
  4. ALLOWED_SCHEMES = {"http", "https"}
  5. ALLOWED_NETLOCS = {
  6. "youtu.be",
  7. "m.youtube.com",
  8. "youtube.com",
  9. "www.youtube.com",
  10. "www.youtube-nocookie.com",
  11. "vid.plus",
  12. }
  13. def _parse_video_id(url: str) -> Optional[str]:
  14. """Parse a YouTube URL and return the video ID if valid, otherwise None."""
  15. parsed_url = urlparse(url)
  16. if parsed_url.scheme not in ALLOWED_SCHEMES:
  17. return None
  18. if parsed_url.netloc not in ALLOWED_NETLOCS:
  19. return None
  20. path = parsed_url.path
  21. if path.endswith("/watch"):
  22. query = parsed_url.query
  23. parsed_query = parse_qs(query)
  24. if "v" in parsed_query:
  25. ids = parsed_query["v"]
  26. video_id = ids if isinstance(ids, str) else ids[0]
  27. else:
  28. return None
  29. else:
  30. path = parsed_url.path.lstrip("/")
  31. video_id = path.split("/")[-1]
  32. if len(video_id) != 11: # Video IDs are 11 characters long
  33. return None
  34. return video_id
  35. class YoutubeLoader:
  36. """Load `YouTube` video transcripts."""
  37. def __init__(
  38. self,
  39. video_id: str,
  40. language: Union[str, Sequence[str]] = "en",
  41. ):
  42. """Initialize with YouTube video ID."""
  43. _video_id = _parse_video_id(video_id)
  44. self.video_id = _video_id if _video_id is not None else video_id
  45. self._metadata = {"source": video_id}
  46. self.language = language
  47. if isinstance(language, str):
  48. self.language = [language]
  49. else:
  50. self.language = language
  51. def load(self) -> List[Document]:
  52. """Load YouTube transcripts into `Document` objects."""
  53. try:
  54. from youtube_transcript_api import (
  55. NoTranscriptFound,
  56. TranscriptsDisabled,
  57. YouTubeTranscriptApi,
  58. )
  59. except ImportError:
  60. raise ImportError(
  61. 'Could not import "youtube_transcript_api" Python package. '
  62. "Please install it with `pip install youtube-transcript-api`."
  63. )
  64. try:
  65. transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
  66. except Exception as e:
  67. print(e)
  68. return []
  69. try:
  70. transcript = transcript_list.find_transcript(self.language)
  71. except NoTranscriptFound:
  72. transcript = transcript_list.find_transcript(["en"])
  73. transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
  74. transcript = " ".join(
  75. map(
  76. lambda transcript_piece: transcript_piece["text"].strip(" "),
  77. transcript_pieces,
  78. )
  79. )
  80. return [Document(page_content=transcript, metadata=self._metadata)]