o
    Wfo~                     @   s|  d dl Z d dlZd dlmZmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+Z+d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl6m8Z8 d dl9m:Z: d dl;Z;d dl<m=Z= ej>?e j>@e j>AeBd  d dlCmDZD eDE ZFe&d e jGd!< ee&d"d#d$d%ZHe ZIeFJd& d'd( ZKd)d* ZLd+d, ZMd-d. ZNd/d0 ZOd1d2 ZPd3d4 ZQd5d6 ZRd7d8 ZSd9d: ZTd;d< ZUd=d> ZVd?d@ ZWdAdB ZXdCdD ZYdEdF ZZdGdH Z[dIdJ Z\dS )K    N)	Timestampread_csv
read_excel)glob)SchemaBuilder)
ChatOpenAI)HumanMessage	AIMessage)UnstructuredHTMLLoader)UnstructuredXMLLoader)ElasticsearchStore)RecursiveCharacterTextSplitter)OpenAIEmbeddings)rmtree)
JSONLoader)Elasticsearch)	PdfReader)	natsorted)dumploaddumps)config)convert)PdfReadError)process)letter)canvas)ImageReader)FPDF)Presentation)
FileFormat)convert_file)Imagez/doc_atlog/)LogUtils
SECRET_KEYOPENAI_API_KEYES_URLi,  T)request_timeoutretry_on_timeoutzindex startedc                 C   s*  t | D ]}|dr&t j| |}t j| |dd}t || q|drDt j| |}t j| |dd}t || q|drbt j| |}t j| |dd}t || q|drt j| |}t j| |dd}t || q|d	rt j| |}t j| |d	d
}t || q|drt j| |}t j| |dd}t || q|drt j| |}t j| |dd}t || q|drt j| |}t j| |dd}t || q|drt j| |}t j| |dd}t || q|dr6t j| |}t j| |dd}t || q|drUt j| |}t j| |dd}t || q|drtt j| |}t j| |dd}t || q|drt j| |}t j| |dd}t || qd S )Nz.PDF.pdfz.DOCX.docxz.TXTz.txtz.DOC.docz.XLSX.xlsxz.CSV.csvz.TSV.tsvz.MD.mdz.PPTXz.pptxz.JSON.jsonz.HTMz.htmz.HTMLz.htmlz.XMLz.xml)oslistdirendswithpathjoinreplacerename)sharepoint_localpathfilenameold_filenew_file r<   </var/www/html/AlliBotV5_chatbot/AlliBotV50/index_attached.pyfile_format_change/   sn   







r>   c           
      C   s   t | |}tj|td}t\}}|d }|dD ]D}|drFtj||	 dd }	tj
|	rE|jt|	d|d d	dd
 |d8 }n|d|| |d8 }|dk r]|  |d }q|  t| d S )N)pagesize(   
z[IMAGE:      d      )widthheightx      )r   r   Canvasr   split
startswithr1   r4   r5   stripexists	drawImager   
drawStringshowPagesaver   )

input_filetemp_diroutput_filetextcrG   rH   yline
image_pathr<   r<   r=   convert_text_and_images_to_pdf   s&   

r\   c                 C   s   t  }|  |jddd zt| ddd}| }W d    n1 s%w   Y  W n$ tyO   t| ddd}| }W d    n1 sHw   Y  Y nw |D ]}|jdd	| d
dd qRtj	
| d }|| d d S )NArial   sizerutf-8encodingzlatin-1rF   
      C)txtlnalignr   z_txt.pdf)r   add_pageset_fontopen	readlinesUnicodeDecodeErrorcellrN   r1   r4   splitextoutput)txt_pathpdfflinesrZ   txt_path_without_extr<   r<   r=   convert_txt_to_pdf   s$   

rx   c                 C   s   t  }|  |jddd tjtjtd }|jd|dd |j	ddd | 
d	}|D ]	}|d
d| q0|| d S )NTr^   )automarginz/DejaVuSans.ttfDejaVu)fnameuni   r_   rA   r   re   )r   rk   set_auto_page_breakr1   r4   r5   dirname__file__add_fontrl   rL   
multi_cellrr   )rW   output_pathrt   	font_pathrv   rZ   r<   r<   r=   save_text_to_pdf   s   
r   c              
   C   s   z+t  }||  tj| d }|| dtj |  t	d|  d|  W dS  t
yH } zt	d|  d|  W Y d }~dS d }~ww )	Nr   z	_pptx.pdfSuccessfully converted  to TFailed to convert  to PDF. Error: F)r   LoadFromFiler1   r4   rq   
SaveToFiler    PDFDisposeprint	Exception)	pptx_pathpresentationpdf_pather<   r<   r=   pptx_to_pdf   s   
r   c                 C   s6   t | d|dgd}|dksJ td|  d|  d S )Nrt   --pdf-engine=xelatex
outputfile
extra_args z
Converted r   )r!   r   )md_file_pathoutput_file_pathrr   r<   r<   r=   convert_md_to_pdf   s   r   c                 C   s   t |  g }tdD ]V}zt|}t|jdkrtd|| W q tyF } ztd| d|  t	
d|  W Y d }~qd }~w tya } ztd| d|  W Y d }~qd }~ww |S )Nz*.pdfr   z
Empty filezError reading z: zError reading %sz"An unexpected error occurred with )r1   chdirr   r   lenpagesr   appendr   logger	exceptionr   )r4   pdfsfilereaderr   r<   r<   r=   readfiles_pdf   s"   
 r   c                 C   sB   t | d}t| dW  d    S 1 sw   Y  d S )Nrbrb   )rm   base64	b64encodereaddecode)r[   
image_filer<   r<   r=   encode_image   s   $r   c                 C   s   t | J}d}||t j |jdkr/|d}| dd}|j|dd |W  d    S | d|j  d	|j  }|| |W  d    S 1 sRw   Y  d S )
N)   r   BMPRGBz.bmpz.jpegJPEG)format.z	.resized.)	r"   rm   	thumbnailLANCZOSr   r   r6   rS   lower)r[   imgmax_sizeconverted_pathr<   r<   r=   resize_and_convert_image   s   

$
$r   c              
   C   sh   zt | d|dgd td|  d|  W dS  ty3 } ztd|  d|  W Y d }~d	S d }~ww )
Nrt   r   r   r   r   Tr   r   F)r!   r   r   )	docx_pathr   r   r<   r<   r=   convert_docx_to_pdf   s   r   c                 C   s4   i }|D ]}| | }t |tr| }|||< q|S )zU
    Convert a row to a JSON object with special handling for Timestamp objects.
    )
isinstancer   	isoformat)rowcolumn_namesdatacol_namevaluer<   r<   r=   row_to_json  s   

r   c                 C   s   dg i}t | }t|jD ]\}}| }|d d|i qW d    n1 s+w   Y  t|ddd}t||ddd W d    d S 1 sKw   Y  d S )	Nr   rW   wrb   rc   F   )ensure_asciiindent)
pdfplumberrm   	enumerater   extract_textr   r   )input_pdf_pathoutput_json_pathpdf_contentrt   page_numpagerW   	json_filer<   r<   r=   pdf_to_json_new  s   	"r   c              	   C   s   t | }| D ]:\}}t||}d|gi}| d|dd  d| d}t|d}	t||	dd	 W d
   n1 s=w   Y  q|d}
|
j| dd d
S )z[
    Convert each row with specific columns in an Excel file to individual JSON files.
    content/r,   r   _r0   r   r   r   NFindex)r   iterrowsr   rL   rm   r   headto_excel)r9   r   output_folderbot_file_namedfr   r   	json_dataoutput_filenameru   	df_headerr<   r<   r=   excel_to_json&  s   

 
r   c                 C   sZ   t | d}t|}W d    n1 sw   Y  t }|| | }t|dd}|S )Nra      r   )rm   r   r   
add_object	to_schemar   )	file_pathr   r   builderschemaschema_stringr<   r<   r=   generate_schema_from_json7  s   

r   c           	      C   s   g }t | D ]G}|drNz"t j| |}t|}t||dd}| }|| |	| W q t
yM } ztd|  t| W Y d }~qd }~ww q|S )Nr0   F)	jq_schematext_contentzjson conversation failed %s)r1   r2   r3   r4   r5   r   r   r   extendr   r   r   infor   )	directory_pathsuccess_file_list	documentsr9   r   r   loaderdocsr   r<   r<   r=   load_json_directoryJ  s"   

r   c                 C   s  t j|st | t | D ]n}|dr~t j| |}t j|d d }t j||}t||}|srt	
d|  zt||| || W q tyq }	 zt \}
}}t	d|  t	
d|  W Y d }	~	qd }	~	ww t	
d|  || qd S )Nr*   r   r)   z%second docx conversation satisfied %sException  %sz docx to pdf conversion failed %sz$first docx conversation satisfied %s)r1   r4   rO   makedirsr2   r3   r5   rq   r   r   r   r\   r   r   sysexc_infor   )input_directoryoutput_directoryr   rU   r9   input_file_pathoutput_file_namer   conv_statusexexc_type	exc_valueexc_tracebackr<   r<   r=   batch_convert_docx_to_pdf_  s.   



r  c                 C   s6   g }| D ]}t |dsi |_||jd< || q|S )Nmetadata
session_id)hasattrr  r   )r   r  updated_documentsdocr<   r<   r=   add_session_id_to_documentsw  s   

r	  c           B         s  zt jt jtd }t j|std t | g }d}t 	| }t
| d }|t
| d  }t
| d }|t
| d  }t|  t| | || z t
| d }	|	D ]}
t|
|
d	d
 ||
dd  qWW n) ty } zt \}}}td|  td|
  d}W Y d }~nd }~ww zt
| d }|D ]}t| ||dd  qW n) ty } zt \}}}td|  td|  d}W Y d }~nd }~ww t
| d }|D ]!}t|}|std|  td|  q||dd  qt
| d }|D ]C}zt||dd ||dd  W q tyR } zt \}}}td|  td|  d}W Y d }~qd }~ww t jt jtd }t| }t j|dd g  |D ]}t j| |}t|}dg i}z]tt|jD ]}|j| }| }|d | qt|dd}|dd }t j||}t |d!} | !| W d    n	1 sw   Y  td"|   |d#d  W qn ty9 }! zPz%|dd }t j||}"t"||" td$|   |d#d  W n# ty- }! ztd%|  td&|!  d}W Y d }!~!nd }!~!ww W Y d }!~!qnd }!~!ww t#t
|d' D ]:}#t$d(|# t%|#d)d*d+}$|$& }%t$d,|% |%r{t'|%|}&t$d- t(d.t j)d/< t* }'t+j,|%t-||'t+. d0}(qBtd1 t
| d2 })|)D ]K}
zt/|
d3d4}*|*j0|
d5d6d*d7 ||
dd  W q ty } zt \}}}td|  td8|
  d}W Y d }~qd }~ww t
| d9 }+|+D ]L}
zt/|
d:d3d;}*|*j0|
d<d=d*d7 ||
dd  W q ty* } zt \}}}td|  td8|
  d}W Y d }~qd }~ww t jt jtd> },t j|,sCt |, td? t
| d }-g |-D ]L}.zt1|.}/|/j23 }0|.dd }1t4|.|0|,|1 |1 W qR ty } zt \}}}td|  td8|.  d}W Y d }~qRd }~ww t#t
|,d' D ][}#zt%|#d@d*d+}$|$& }2W n, ty } zt \}}}td|  td8|#  d}g }2W Y d }~nd }~ww |2rt'|2|}3t(d.t j)d/< t* }'t+j,|2t-||'t+. d0}(qtdA z%t5| |}4|4r,t'|4|}5t(d.t j)d/< t* }'t+j,|4t-||'t+. d0}(W n tyG }6 zt|6 d}W Y d }6~6nd }6~6ww t
| dB }7g }8|7D ]Y}#z6t6|#}$|87|$&  ||#dd  |8rt'|8|}9t(d.t j)d/< t* }'t+j,|8t-||'t+. d0}(W qR ty }6 zd}tdC|# dD|6  W Y d }6~6qRd }6~6ww t
| dE }:g };|:D ]Y}#z6t6|#}$|;7|$&  ||#dd  |;rt'|;|}<t(d.t j)d/< t* }'t+j,|;t-||'t+. d0}(W q ty }6 ztdC|# dD|6  d}W Y d }6~6qd }6~6ww t
| dF }=g }>|=D ]Y}#z6t8|#}$|>7|$&  ||#dd  |>rPt'|>|}?t(d.t j)d/< t* }'t+j,|>t-||'t+. d0}(W q tys }6 zd}tdC|# dD|6  W Y d }6~6qd }6~6ww tdG  fdHdI|D }@fdJdI|D }A||@ |A }tdK|  t|t|krd}t9| t9|, tdL |W S  ty }! ztd&|!  W Y d }!~!dS d }!~!ww )MNz/temp_imagesztemp img path doesn't existrf   z/*.pdfz/*.PDFz/*.xlsxz/*.XLSXz/*.docr+   _docr   rC   r   zdoc to pdf conversion failed %sr   z/*.txtztxt to pdf conversion failed %sz/*.pptxzException occured at %sz pptx to pdf conversion failed %sz/*.mdr/   z_md.pdfzmd to pdf conversion failed %sz/pdf_json_convT)exist_okr   r   r   r)   r0   r   zpypdf satisfied %sr   zpdfplumber satisfied %szexception occured pdf %szException occured due to %sz/*.jsonzentered jsonz.pages[]F)r   r   r   json_documentszsession id added to pdfr$   r%   )r   es_connection
index_name	embeddingstrategyzpdf index addedz/*.csvlatin1rc   r-   z	_csv.xlsxr   z csv to xlsx conversion failed %sz/*.tsv	)seprd   r.   z	_tsv.xlsxz/excel_json_convzentered excel conversion partz
.content[]zexcel index addedz/*.htmlzException occured in z and the error is z/*.htmz/*.xmlzfiles are indexedc                    s6   g | ]}| d d  dd  v r| d d qS )r   rC   r   r   rL   .0i)pdf_successr<   r=   
<listcomp>  s   6 z"start_indexing.<locals>.<listcomp>c                    s,   g | ]}| d d  v r| d d qS )r   rC   r  r  )
xl_successr<   r=   r    s   , zsuccess_file_list %szfiles removed):r1   r4   r5   r   r   isdirr   r   mkdirr2   r   r>   r  r   r6   r   rL   r   r   r   r   rx   r   r   r   r   r   ranger   r   r   r   rm   writer   r   r   r   r   r	  r   environr   r   from_documentsesExactRetrievalStrategyr   r   r   columnstolistr   r   r
   r   r   r   )Br8   	indexnamer  rU   r   status_codeorg_filelistorg_pdffilesorg_excelfiles	doc_filesfier   r   r  r  	txt_filesr   
pptx_files	pptx_convmd_filespdf_json_pathr   pdf_filer   r   pdf_datar   r   rW   pdf_jsonjson_filename	json_pathr   r   r   r  r   r  json_documents_with_session_id
embeddingsdbcsv_filer   tsv_filebot_json_path
xlsx_filesr   	dataframer#  r   excel_documentsexcel_documents_with_session_idJSON_others#only_json_documents_with_session_iderror	html_filehtml_loaded_documentshtml_documents_with_session_idhtm_filehtm_loaded_documentshtm_documents_with_session_idxml_filexml_loaded_documentsxml_documents_with_session_idpdf_org_succex_org_succr<   )r  r  r=   start_indexing  sH  


	





	








$


$

rN  )]r1   r   pandasr   r   r   r   gensonr   langchain.chat_modelsr   langchain.schema.messagesr   r	   $langchain_community.document_loadersr
   r    langchain_community.vectorstoresr   langchain.text_splitterr   langchain_openair   shutilr   langchain.document_loadersr   elasticsearchr   PyPDF2r   natsortr   jsonr   r   r   decoupler   doc2pdfr   PyPDF2.errorsr   r   docx2txtr   reportlab.lib.pagesizesr   reportlab.pdfgenr   reportlab.lib.utilsr   fpdfr   spire.presentationr   r    pypandocr!   r   PILr"   r4   r   r5   r   r   	doc_atlogr#   getRootLoggerr   r  r!  r7  r   r>   r\   rx   r   r   r   r   r   r   r   r   r   r   r   r   r  r	  rN  r<   r<   r<   r=   <module>   sp     
U	
