o
    ZhIH                     @  s  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ eeZdgZ eG dd dZ!ddhZ"h dZ#dddZ$G dd deZ%G dd deZ&eG dd deZ'dS ) zLoads YouTube transcript.    )annotationsN)Enum)Path)AnyDict	GeneratorListOptionalSequenceUnion)parse_qsurlparse)
ParseError)Document)model_validator)	dataclass)
BaseLoaderz0https://www.googleapis.com/auth/youtube.readonlyc                   @  s   e Zd ZU dZe d d Zded< e d d Zded< e d d Z	ded< dddZ
eddedddZdddZdS )GoogleApiClienta  Generic Google API Client.

    To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
    python package installed.
    As the google api expects credentials you need to set up a google account and
    register your Service. "https://developers.google.com/docs/api/quickstart/python"

    *Security Note*: Note that parsing of the transcripts relies on the standard
        xml library but the input is viewed as trusted in this case.


    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )

    z.credentialszcredentials.jsonr   credentials_pathservice_account_pathz
token.json
token_pathreturnNonec                 C  s   |   | _d S N)_load_credentialscredsself r   c/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/youtube.py__post_init__2   s   zGoogleApiClient.__post_init__beforemodevaluesr   c                 C  &   |j ds|j dstd|j S )DValidate that either folder_id or document_ids is set, but not both.r   r   -Must specify either channel_name or video_idskwargsget
ValueErrorclsr$   r   r   r   #validate_channel_or_videoIds_is_set5   s
   z3GoogleApiClient.validate_channel_or_videoIds_is_setc           	      C  s  z ddl m} ddlm} ddlm} ddlm} ddlm	} W n t
y+   t
dw d}| j r<|jt| jS | j rJ|t| jt}|rO|js|r^|jr^|jr^||  n|t| jt}|jdd	}t| jd
}||  W d   |S 1 sw   Y  |S )zLoad credentials.r   )Request)service_account)Credentials)InstalledAppFlowYouTubeTranscriptApiYou must run`pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib youtube-transcript-api` to use the Google Drive loaderN)portw)Zgoogle.auth.transport.requestsr/   Zgoogle.oauth2r0   Zgoogle.oauth2.credentialsr1   Zgoogle_auth_oauthlib.flowr2   youtube_transcript_apir4   ImportErrorr   existsZfrom_service_account_filestrr   Zfrom_authorized_user_fileSCOPESZvalidZexpiredZrefresh_tokenrefreshZfrom_client_secrets_filer   Zrun_local_serveropenwriteto_json)	r   r/   r0   r1   r2   r4   r   Zflowtokenr   r   r   r   @   s>   





z!GoogleApiClient._load_credentialsNr   r   r$   r   r   r   )r   r   )__name__
__module____qualname____doc__r   homer   __annotations__r   r   r    r   classmethodr.   r   r   r   r   r   r      s   
 
	r   httphttps>   zyoutube.comzwww.youtube.comzm.youtube.comzvid.pluszwww.youtube-nocookie.comzyoutu.beurlr;   r   Optional[str]c                 C  s   t | }|jtvrdS |jtvrdS |j}|dr7|j}t|}d|v r5|d }t	|t
r0|n|d }ndS |jd}|dd }t|dkrLdS |S )zEParse a YouTube URL and return the video ID if valid, otherwise None.Nz/watchvr   /   )r   schemeALLOWED_SCHEMESnetlocALLOWED_NETLOCSpathendswithqueryr   
isinstancer;   lstripsplitlen)rM   
parsed_urlrW   rY   Zparsed_queryZidsvideo_idr   r   r   _parse_video_idt   s$   


r`   c                   @  s   e Zd ZdZdZdZdZdS )TranscriptFormatz3Output formats of transcripts from `YoutubeLoader`.textlineschunksN)rD   rE   rF   rG   TEXTLINESCHUNKSr   r   r   r   ra      s
    ra   c                   @  sn   e Zd ZdZdddejddfd-ddZed.ddZe	d/ddZ
d0d!d"Zd1d%d&Zd2d(d)Zd3d+d,ZdS )4YoutubeLoaderz!Load `YouTube` video transcripts.FenNx   r_   r;   add_video_infoboollanguageUnion[str, Sequence[str]]translationrN   transcript_formatra   continue_on_failurechunk_size_secondsintc                 C  sR   || _ d|i| _|| _|| _t|tr|g| _n|| _|| _|| _|| _|| _	dS )z!Initialize with YouTube video ID.sourceN)
r_   	_metadatark   rm   rZ   r;   ro   rp   rq   rr   )r   r_   rk   rm   ro   rp   rq   rr   r   r   r   __init__   s   



zYoutubeLoader.__init__youtube_urlr   c                 C  s    t | }|std|  d|S )z*Extract video ID from common YouTube URLs.z.Could not determine the video ID for the URL "z".)r`   r+   )rw   r_   r   r   r   extract_video_id   s   
zYoutubeLoader.extract_video_idr)   r   c                 K  s   |  |}| |fi |S )z|Given a YouTube URL, construct a loader.
        See `YoutubeLoader()` constructor for a list of keyword arguments.
        )rx   )r-   rw   r)   r_   r   r   r   from_youtube_url   s   
zYoutubeLoader.from_youtube_urlchunk_pieces
List[Dict]chunk_start_secondsr   c              
   C  sp   t |d\}}t |d\}}tdtdd |i | j||dd|dd|dd| j d| d	d
dS )z0Create Document from chunk of transcript pieces.<    c                 S     | d  dS Nrb   r~   strip)Zchunk_piecer   r   r   <lambda>       z4YoutubeLoader._make_chunk_document.<locals>.<lambda>02d: https://www.youtube.com/watch?v=z&t=s)Zstart_secondsZstart_timestamprt   page_contentmetadata)divmodr   joinmapru   r_   )r   rz   r|   mr   hr   r   r   _make_chunk_document   s    
z"YoutubeLoader._make_chunk_documenttranscript_piecesGenerator[Document, None, None]c                 c  s    g }d}| j }|D ]%}|d |d  }||kr*|r!| ||V  g }|}|| j 7 }|| q
t|dkr?| ||V  d S d S )Nr   startduration)rr   r   appendr]   )r   r   rz   r|   Zchunk_time_limittranscript_pieceZ	piece_endr   r   r   _get_transcript_chunks   s    
z$YoutubeLoader._get_transcript_chunksList[Document]c                 C  s0  zddl m}m}m} W n ty   tdw | jr%|  }| j| z|	| j
}W n |y8   g  Y S w z|| j}W n |yP   |dg}Y nw | jdur\|| j}| }| jtjkrxdtdd |}t|| jd	gS | jtjkrttd
d |S | jtjkrt| |S td)z1Load YouTube transcripts into `Document` objects.r   )NoTranscriptFoundTranscriptsDisabledr4   zvCould not import "youtube_transcript_api" Python package. Please install it with `pip install youtube-transcript-api`.ri   Nr~   c                 S  r   r   r   r   r   r   r   r     r   z$YoutubeLoader.load.<locals>.<lambda>r   c                 S  s(   t | d dttdd |  dS )Nrb   r~   c                 S  s   | d dkS )Nr   rb   r   )itemr   r   r   r   "  s    z6YoutubeLoader.load.<locals>.<lambda>.<locals>.<lambda>r   )r   r   dictfilteritemsr   r   r   r   r     s    zUnknown transcript format.)r8   r   r   r4   r9   rk   _get_video_inforu   updatelist_transcriptsr_   find_transcriptrm   ro   	translatefetchrp   ra   re   r   r   r   rf   listrg   r   r+   )r   r   r   r4   
video_infotranscript_list
transcriptr   r   r   r   load   sR   
zYoutubeLoader.loadr   c                 C  s   zddl m} W n ty   tdw |d| j }|jp d|jp$d|jp(d|jp,d|jr5|j	dnd|j
p:d|jp>dd}|S )zGet important video information.

        Components include:
            - title
            - description
            - thumbnail URL,
            - publish_date
            - channel author
            - and more.
        r   )YouTubezVCould not import "pytube" Python package. Please install it with `pip install pytube`.r   Unknownz%Y-%m-%d %H:%M:%S)titledescriptionZ
view_countthumbnail_urlpublish_datelengthauthor)Zpytuber   r9   r_   r   r   Zviewsr   r   strftimer   r   )r   r   Zytr   r   r   r   r   /  s&   zYoutubeLoader._get_video_info)r_   r;   rk   rl   rm   rn   ro   rN   rp   ra   rq   rl   rr   rs   )rw   r;   r   r;   )rw   r;   r)   r   r   rh   )rz   r{   r|   rs   r   r   )r   r{   r   r   r   r   )r   r   )rD   rE   rF   rG   ra   re   rv   staticmethodrx   rJ   ry   r   r   r   r   r   r   r   r   rh      s"    	


?rh   c                   @  s   e Zd ZU dZded< dZded< dZded< d	Zd
ed< dZded< dZ	d
ed< d0ddZ
d1ddZedded2ddZd3dd Zd4d#d$Zd5d%d&Zd6d(d)Zd7d,d-Zd8d.d/ZdS )9GoogleApiYoutubeLoadera  Load all Videos from a `YouTube` Channel.

    To use, you should have the ``googleapiclient,youtube_transcript_api``
    python package installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additionally you have to either provide a channel name or a list of videoids
    "https://developers.google.com/docs/api/quickstart/python"



    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            from langchain_community.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name = "CodeAesthetic"
            )
            load.load()

    r   google_api_clientNrN   channel_namezOptional[List[str]]	video_idsTrl   rk   ri   r;   captions_languageFrq   r   r   c                 C  s   |  | jj| _d S r   )_build_youtube_clientr   r   youtube_clientr   r   r   r   r    v  s   z$GoogleApiYoutubeLoader.__post_init__r   r   c                 C  sB   zddl m} ddlm} W n ty   tdw |dd|dS )Nr   )buildr3   r5   ZyoutubeZv3)credentials)Zgoogleapiclient.discoveryr   r8   r4   r9   )r   r   r   r4   r   r   r   r   y  s   
z,GoogleApiYoutubeLoader._build_youtube_clientr!   r"   r$   c                 C  r%   )r&   r   r   r'   r(   r,   r   r   r   r.     s   z:GoogleApiYoutubeLoader.validate_channel_or_videoIds_is_setr_   c                 C  sr   ddl m}m} ||}z	|| jg}W n |y*   |D ]}|| j}qY nw | }ddd |D S )Nr   )r   r4   r~   c                 S  s   g | ]	}|d   dqS )rb   r~   r   ).0tr   r   r   
<listcomp>  s    zGGoogleApiYoutubeLoader._get_transcripe_for_video_id.<locals>.<listcomp>)	r8   r   r4   r   r   r   r   r   r   )r   r_   r   r4   r   r   Zavailable_transcriptr   r   r   r   _get_transcripe_for_video_id  s   
z3GoogleApiYoutubeLoader._get_transcripe_for_video_idr)   r   c                 K  s8   |  |}| j jd|d }t||dd dS )N
id,snippetpartidr   r   r   )r   r   Zvideosr   executer   r*   )r   r_   r)   ZcaptionsZvideo_responser   r   r   _get_document_for_video_id  s   
z1GoogleApiYoutubeLoader._get_document_for_video_idc                 C  s8   | j  jd|ddd}| }|d d d d }|S )Nr   channel   )r   qtype
maxResultsr   r   Z	channelId)r   searchr   r   )r   r   requestresponse
channel_idr   r   r   _get_channel_id  s   
z&GoogleApiYoutubeLoader._get_channel_idr   c                 C  s4   | j  jd|d}| }|d d d d d S )NZcontentDetailsr   r   r   ZrelatedPlaylistsZuploads)r   Zchannelsr   r   )r   r   r   r   r   r   r   _get_uploads_playlist_id  s   
z/GoogleApiYoutubeLoader._get_uploads_playlist_idr   r   c                 K  sH  z
ddl m}m} W n ty   tdw | |}| |}| j jd|dd}g }|d ur|	 }	|	d D ][}
|
d d	 d
 }d
|i}| j
rX|
d d ||
d  z| |}|t||d W q9 ||tfy } z| jrtdd|
d d
  d|   n|W Y d }~q9d }~ww | j ||	}|d us1|S )Nr   )r   r   zTYou must run`pip install --upgrade youtube-transcript-api` to use the youtube loaderr   2   )r   Z
playlistIdr   r   ZsnippetZ
resourceIdZvideoIdZ
thumbnailsr   zError fetching transscript r~   r   z, exception: )r8   r   r   r9   r   r   r   ZplaylistItemsr   r   rk   popr   r   r   r   r   rq   loggererrorr   Z	list_next)r   r   r)   r   r   r   Zuploads_playlist_idr   r   r   r   r_   Z	meta_datar   er   r   r   _get_document_for_channel  sZ   



	z0GoogleApiYoutubeLoader._get_document_for_channelc                   sL   g } j r|  j  |S  jr"| fdd jD  |S td)zLoad documents.c                   s   g | ]}  |qS r   )r   )r   r_   r   r   r   r     s    z/GoogleApiYoutubeLoader.load.<locals>.<listcomp>r'   )r   extendr   r   r+   )r   Zdocument_listr   r   r   r     s   

zGoogleApiYoutubeLoader.loadrB   )r   r   r   r   rC   )r_   r;   r   r;   )r_   r;   r)   r   r   r   )r   r;   r   r;   )r   r;   r   r;   )r   r;   r)   r   r   r   r   )rD   rE   rF   rG   rI   r   r   rk   r   rq   r    r   r   rJ   r.   r   r   r   r   r   r   r   r   r   r   r   Q  s&   
 






5r   )rM   r;   r   rN   )(rG   
__future__r   loggingenumr   pathlibr   typingr   r   r   r   r	   r
   r   urllib.parser   r   Zxml.etree.ElementTreer   Zlangchain_core.documentsr   Zpydanticr   Zpydantic.dataclassesr   Z)langchain_community.document_loaders.baser   	getLoggerrD   r   r<   r   rT   rV   r`   ra   rh   r   r   r   r   r   <module>   s0    $
Q

 8