o
    Zh                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ G dd	 d	eeZdS )
    N)ABC)Path)IteratorListSetTuple)Document)BaseBlobParser)Blobc                   @   s   e Zd ZdZdedee fddZdedee fddZde	j
d	edeeeeef  fd
dZdede	j
dee dee dee f
ddZdS )
VsdxParserzParser for vsdx files.blobreturnc                 C   s
   |  |S )zParse a vsdx file.)
lazy_parse)selfr    r   h/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/parsers/vsdx.pyparse   s   
zVsdxParser.parsec              	   #   s       %}t|d}| | j}W d   n1 sw   Y  W d   n1 s-w   Y   fdd|D E dH  dS )zoRetrieve the contents of pages from a .vsdx file
        and insert them into documents, one document per page.rNc                    s(   g | ]\}}}t | j||d dqS ))sourcepage	page_name)page_contentmetadata)r   r   ).0page_numberr   r   r   r   r   
<listcomp>   s    	z)VsdxParser.lazy_parse.<locals>.<listcomp>)Zas_bytes_iozipfileZipFileget_pages_contentr   )r   r   Zpdf_file_objzfileZpagesr   r   r   r      s   

	zVsdxParser.lazy_parser    r   c                    s~  zddl W n ty   tdw d vr!td| dS d vr0td| dS d vr?td	| dS d}d}d}t|d
 d trldd |d
 d D  n|d
 d d 	 g t|d d trdd |d d D nd|d d d  g|d d d d dt
  }dd |D } fdd|D }g }|D ]@}	t|	}
td|
}t
|dkrd|}dddddd d!}| D ]
\}}|||}q||d" qfd#d|D }g }tt||D ],\}\}| ||dfd$d|D fd%d|D  }||||f q|S )&a  Get the content of the pages of a vsdx file.

        Attributes:
            zfile (zipfile.ZipFile): The vsdx file under zip format.
            source (str): The path of the vsdx file.

        Returns:
            list[tuple[int, str, str]]: A list of tuples containing the page number,
            the name of the page and the content of the page
            for each page of the vsdx file.
        r   NzfThe xmltodict library is required to parse vsdx files. Please install it with `pip install xmltodict`.zvisio/pages/pages.xmlz'WARNING - No pages.xml file found in {}z visio/pages/_rels/pages.xml.relsz,WARNING - No pages.xml.rels file found in {}zdocProps/app.xmlz%WARNING - No app.xml file found in {}ZPagesZPagec                 S   s   g | ]}|d    qS )@Namestripr   relr   r   r   r   P   s    z0VsdxParser.get_pages_content.<locals>.<listcomp>r!   RelationshipsRelationshipc                 S   s   g | ]}d |d  qS )visio/pages/@Targetr   r$   r   r   r   r   X   s    
r(   r)   
PropertiesZTitlesOfPartsz	vt:vectorzvt:lpstrc                 S   s   g | ]}|  qS r   r"   r   namer   r   r   r   d   s    c                    s   g | ]}  |  qS r   )indexr#   r+   )disordered_namesdisordered_pathsr   r   r   e   s    z("#text"\s*:\s*"([^\\"]*(?:\\.[^\\"]*)*)"
	-'   é   ô)z\nz\tz\u2013z\u2019z\u00e9rz\u00f4mer   r   c              
      sJ   g | ]!}d t |j d v r| d t |j ddqS )zvisio/pages/_rels/z	.xml.rels)pathcontent)r   stemnamelistr   read)r   Z	page_path)	xmltodictr    r   r   r      s    c                    s    g | ]}|d   v r|d qS r6   r   r   Zpage_)relationshipsr   r   r      
    c                    s    g | ]}|d   kr|d qS r6   r   r=   )r7   r   r   r      r?   )r<   ImportErrorr:   printformatr   r;   
isinstancelistr#   lenjsondumpsrefindalljoinitemsreplaceappend	enumeratezipget_relationships)r   r    r   Zpagesxml_contentZappxml_contentZpagesxmlrels_contentZordered_namesZordered_pathsZdisordered_pagesr8   Zstring_contentZsamplesr   Zmap_symboleskeyvaluepagexml_relsZordered_pagesr   r   r   )r.   r/   r7   r>   r<   r    r   r   (   s   







zVsdxParser.get_pages_contentr   filelistrS   c                    s   t  j}t  jd| d }t|| vrt S t fdd|D }t|d d tr=dd |d d D }n	|d d d	 g}tfd
d|D 	|}	|	D ]}
|	| 
|
|||B }	qV|	S )a  Get the relationships of a page and the relationships of its relationships,
        etc... recursively.
        Pages are based on other pages (ex: background page),
        so we need to get all the relationships to get all the content of a single page.
        z_rels/z.relsc                 3   s$    | ]}|d   kr|d V  qdS )r7   r8   Nr   r=   )r   r   r   	<genexpr>   s    z/VsdxParser.get_relationships.<locals>.<genexpr>r&   r'   c                 S   s   g | ]}|d  qS )r)   r   r$   r   r   r   r      s    z0VsdxParser.get_relationships.<locals>.<listcomp>r)   c                    s   g | ]}t  | qS r   )str)r   target)parent_pathr   r   r      s    )r   r,   parentrV   r:   setnextrC   rD   intersectionrP   )r   r   r    rT   rS   Z	name_pathZ	rels_pathZpagexml_rels_contenttargetsr>   r%   r   )r   rX   r   rP      s.   


zVsdxParser.get_relationshipsN)__name__
__module____qualname____doc__r
   r   r   r   r   r   r   rV   r   r   intr   dictr   rP   r   r   r   r   r      s,    
~r   )rF   rH   r   abcr   pathlibr   typingr   r   r   r   Z%langchain_community.docstore.documentr   Z)langchain_community.document_loaders.baser	   Z1langchain_community.document_loaders.blob_loadersr
   r   r   r   r   r   <module>   s    