o
    '®f  ã                   @   s|  d dl Z d dlZd dlmZmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+Z+d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl6m8Z8 d dl9m:Z: d dl;Z;d dl<m=Z= ej> ?e j> @e j> AeB¡¡d ¡ d dlCmDZD eD E¡ ZFe&d ƒe jGd!< ee&d"ƒd#d$d%ZHeƒ ZIeF Jd&¡ d'd(„ ZKd)d*„ ZLd+d,„ ZMd-d.„ ZNd/d0„ ZOd1d2„ ZPd3d4„ ZQd5d6„ ZRd7d8„ ZSd9d:„ ZTd;d<„ ZUd=d>„ ZVd?d@„ ZWdAdB„ ZXdCdD„ ZYdEdF„ ZZdGdH„ Z[dIdJ„ Z\dS )Ké    N)Ú	TimestampÚread_csvÚ
read_excel)Úglob)ÚSchemaBuilder)Ú
ChatOpenAI)ÚHumanMessageÚ	AIMessage)ÚUnstructuredHTMLLoader)ÚUnstructuredXMLLoader)ÚElasticsearchStore)ÚRecursiveCharacterTextSplitter)ÚOpenAIEmbeddings)Úrmtree)Ú
JSONLoader)ÚElasticsearch)Ú	PdfReader)Ú	natsorted)ÚdumpÚloadÚdumps)Úconfig)Úconvert)ÚPdfReadError)Úprocess)Úletter)Úcanvas)ÚImageReader)ÚFPDF)ÚPresentation)Ú
FileFormat)Úconvert_file)ÚImagez/attach_ind_log/)ÚLogUtilsÚ
SECRET_KEYÚOPENAI_API_KEYÚES_URLi,  T)Úrequest_timeoutÚretry_on_timeoutzindex startedc                 C   s"  t  | ¡D ]}| d¡r&t j | |¡}t j | | dd¡¡}t  ||¡ q| d¡rDt j | |¡}t j | | dd¡¡}t  ||¡ q| d¡rbt j | |¡}t j | | dd¡¡}t  ||¡ q| d¡r€t j | |¡}t j | | dd¡¡}t  ||¡ q| d	¡ržt j | |¡}t j | | d	d
¡¡}t  ||¡ q| d¡r¼t j | |¡}t j | | dd¡¡}t  ||¡ q| d¡rÚt j | |¡}t j | | dd¡¡}t  ||¡ q| d¡røt j | |¡}t j | | dd¡¡}t  ||¡ q| d¡rt j | |¡}t j | | dd¡¡}t  ||¡ q| d¡r6t j | |¡}t j | | dd¡¡}t  ||¡ q| d¡rUt j | |¡}t j | | dd¡¡}t  ||¡ q| d¡rtt j | |¡}t j | | dd¡¡}t  ||¡ q| d¡r“t j | |¡}t j | | dd¡¡}t  ||¡ q| d¡r²t j | |¡}t j | | dd¡¡}t  ||¡ q| d¡rÑt j | |¡}t j | | dd¡¡}t  ||¡ q| d¡rðt j | |¡}t j | | dd ¡¡}t  ||¡ q| d!¡rt j | |¡}t j | | d!d"¡¡}t  ||¡ qd S )#Nz.PDFú.pdfz.DOCXú.docxz.TXTz.txtz.DOCú.docz.XLSXú.xlsxz.CSVú.csvz.TSVú.tsvz.MDú.mdz.PPTXz.pptxz.JSONú.jsonz.HTMz.htmz.HTMLz.htmlz.XMLz.xmlz.PNGú.pngz.JPEGú.jpegz.BMPú.bmpz.JPGú.jpg)ÚosÚlistdirÚendswithÚpathÚjoinÚreplaceÚrename)Úsharepoint_localpathÚfilenameÚold_fileÚnew_file© r@   ú8/var/www/html/AlliBotV5_chatbot/AlliBotV50/index_test.pyÚfile_format_change/   sŽ   







€—rB   c           
      C   sÐ   t | |ƒ}tj|td}t\}}|d }| d¡D ]D}| d¡rFtj || 	¡ dd… ¡}	tj 
|	¡rE|jt|	ƒd|d d	dd
 |d8 }n| d||¡ |d8 }|dk r]| ¡  |d }q| ¡  t|ƒ d S )N)Úpagesizeé(   Ú
z[IMAGE:é   éÿÿÿÿé   éd   éÈ   )ÚwidthÚheightéx   é   )r   r   ÚCanvasr   ÚsplitÚ
startswithr5   r8   r9   ÚstripÚexistsÚ	drawImager   Ú
drawStringÚshowPageÚsaver   )
Ú
input_fileÚtemp_dirÚoutput_fileÚtextÚcrK   rL   ÚyÚlineÚ
image_pathr@   r@   rA   Úconvert_text_and_images_to_pdf›   s&   

€€r`   c                 C   sæ   t ƒ }| ¡  |jddd zt| ddd}| ¡ }W d   ƒ n1 s%w   Y  W n$ tyO   t| ddd}| ¡ }W d   ƒ n1 sHw   Y  Y nw |D ]}|jdd	| ¡ d
dd qRtj	 
| ¡d }| |› d¡ d S )NÚArialé   ©ÚsizeÚrúutf-8©Úencodingzlatin-1rJ   é
   é   ÚC)ÚtxtÚlnÚalignr   z_txt.pdf)r   Úadd_pageÚset_fontÚopenÚ	readlinesÚUnicodeDecodeErrorÚcellrR   r5   r8   ÚsplitextÚoutput)Útxt_pathÚpdfÚfÚlinesr^   Útxt_path_without_extr@   r@   rA   Úconvert_txt_to_pdf±   s$   
ÿ€
ÿ€þr|   c                 C   s‚   t ƒ }| ¡  |jddd tj tj t¡¡d }|jd|dd |j	ddd |  
d	¡}|D ]	}| d
d|¡ q0| |¡ d S )NTrb   )ÚautoÚmarginz/DejaVuSans.ttfÚDejaVu)ÚfnameÚunié   rc   rE   r   ri   )r   ro   Úset_auto_page_breakr5   r8   r9   ÚdirnameÚ__file__Úadd_fontrp   rP   Ú
multi_cellrv   )r[   Úoutput_pathrx   Ú	font_pathrz   r^   r@   r@   rA   Úsave_text_to_pdfÇ   s   
rŠ   c              
   C   s’   z+t ƒ }| | ¡ tj | ¡d }| |› dtj¡ | ¡  t	d| › d|› ƒ W dS  t
yH } zt	d| › d|› ƒ W Y d }~dS d }~ww )	Nr   z	_pptx.pdfúSuccessfully converted ú to TúFailed to convert ú to PDF. Error: F)r   ÚLoadFromFiler5   r8   ru   Ú
SaveToFiler    ÚPDFÚDisposeÚprintÚ	Exception)Ú	pptx_pathÚpresentationÚpdf_pathÚer@   r@   rA   Úpptx_to_pdfÙ   s   
€þr™   c                 C   s6   t | d|dgd}|dksJ ‚td| › d|› ƒ d S )Nrx   ú--pdf-engine=xelatex©Ú
outputfileÚ
extra_argsÚ z
Converted rŒ   )r!   r“   )Úmd_file_pathÚoutput_file_pathrv   r@   r@   rA   Úconvert_md_to_pdfè   s   r¡   c                 C   sÈ   t  | ¡ g }tdƒD ]V}zt|ƒ}t|jƒdkrtdƒ‚| |¡ W q tyF } ztd|› d|› ƒ t	 
d| ¡ W Y d }~qd }~w tya } ztd|› d|› ƒ W Y d }~qd }~ww |S )Nz*.pdfr   z
Empty filezError reading ú: zError reading %sz"An unexpected error occurred with )r5   Úchdirr   r   ÚlenÚpagesr   Úappendr“   ÚloggerÚ	exceptionr”   )r8   ÚpdfsÚfileÚreaderr˜   r@   r@   rA   Úreadfiles_pdfî   s"   
€ €ÿr¬   c                 C   sB   t | dƒ}t | ¡ ¡ d¡W  d   ƒ S 1 sw   Y  d S )NÚrbrf   )rq   Úbase64Ú	b64encodeÚreadÚdecode)r_   Ú
image_filer@   r@   rA   Úencode_imageÿ   s   $ÿr³   c                 C   s²   t  | ¡J}d}| |t j¡ |jdkr/| d¡}|  dd¡}|j|dd |W  d   ƒ S |  d|j ¡ › d	|j ¡ › ¡}| |¡ |W  d   ƒ S 1 sRw   Y  d S )
N)é   r´   ÚBMPÚRGBr3   r2   ÚJPEG)ÚformatÚ.z	.resized.)	r"   rq   Ú	thumbnailÚLANCZOSr¸   r   r:   rW   Úlower)r_   ÚimgÚmax_sizeÚconverted_pathr@   r@   rA   Úresize_and_convert_image  s   

ö$
$ñrÀ   c              
   C   sh   zt | d|dgd td| › d|› ƒ W dS  ty3 } ztd| › d|› ƒ W Y d }~d	S d }~ww )
Nrx   rš   r›   r‹   rŒ   Tr   rŽ   F)r!   r“   r”   )Ú	docx_pathr—   r˜   r@   r@   rA   Úconvert_docx_to_pdf  s   €þrÂ   c                 C   s4   i }|D ]}| | }t |tƒr| ¡ }|||< q|S )zU
    Convert a row to a JSON object with special handling for Timestamp objects.
    )Ú
isinstancer   Ú	isoformat)ÚrowÚcolumn_namesÚdataÚcol_nameÚvaluer@   r@   rA   Úrow_to_json  s   

rÊ   c                 C   s¤   dg i}t  | ¡}t|jƒD ]\}}| ¡ }|d  d|i¡ qW d   ƒ n1 s+w   Y  t|ddd}t||ddd W d   ƒ d S 1 sKw   Y  d S )	Nr¥   r[   Úwrf   rg   Fé   )Úensure_asciiÚindent)Ú
pdfplumberrq   Ú	enumerater¥   Úextract_textr¦   r   )Úinput_pdf_pathÚoutput_json_pathÚpdf_contentrx   Úpage_numÚpager[   Ú	json_filer@   r@   rA   Úpdf_to_json_new+  s   ÿþþ	"ÿrØ   c              	   C   s¢   t | ƒ}| ¡ D ]:\}}t||ƒ}d|gi}|› d| d¡d › d|› d}t|dƒ}	t||	dd	 W d
  ƒ n1 s=w   Y  q| d¡}
|
j| dd d
S )z[
    Convert each row with specific columns in an Excel file to individual JSON files.
    Úcontentú/r,   r   Ú_r0   rË   rÌ   ©rÎ   NF©Úindex)r   ÚiterrowsrÊ   rP   rq   r   ÚheadÚto_excel)r=   rÆ   Úoutput_folderÚbot_file_nameÚdfrÞ   rÅ   Ú	json_dataÚoutput_filenamery   Ú	df_headerr@   r@   rA   Úexcel_to_json=  s   

 ÿ€
rè   c                 C   sZ   t | dƒ}t|ƒ}W d   ƒ n1 sw   Y  tƒ }| |¡ | ¡ }t|dd}|S )Nre   é   rÜ   )rq   r   r   Ú
add_objectÚ	to_schemar   )Ú	file_pathrª   rÇ   ÚbuilderÚschemaÚschema_stringr@   r@   rA   Úgenerate_schema_from_jsonN  s   
þ
rð   c           	      C   s¢   g }t  | ¡D ]G}| d¡rNz"t j | |¡}t|ƒ}t||dd}| ¡ }| |¡ | 	|¡ W q t
yM } zt d| ¡ t|ƒ W Y d }~qd }~ww q|S )Nr0   F)Ú	jq_schemaÚtext_contentzjson conversation failed %s)r5   r6   r7   r8   r9   rð   r   r   Úextendr¦   r”   r§   Úinfor“   )	Údirectory_pathÚsuccess_file_listÚ	documentsr=   rì   rï   ÚloaderÚdocsr˜   r@   r@   rA   Úload_json_directorya  s"   

€þôrú   c                 C   s  t j |¡st  |¡ t  | ¡D ]n}| d¡r~t j | |¡}t j |¡d d }t j ||¡}t||ƒ}|srt	 
d| ¡ zt|||ƒ | |¡ W q tyq }	 zt ¡ \}
}}t	 d| ¡ t	 
d| ¡ W Y d }	~	qd }	~	ww t	 
d| ¡ | |¡ qd S )Nr*   r   r)   z%second docx conversation satisfied %súException  %sz docx to pdf conversion failed %sz$first docx conversation satisfied %s)r5   r8   rS   Úmakedirsr6   r7   r9   ru   rÂ   r§   rô   r`   r¦   r”   ÚsysÚexc_infor¨   )Úinput_directoryÚoutput_directoryrö   rY   r=   Úinput_file_pathÚoutput_file_namer    Úconv_statusÚexÚexc_typeÚ	exc_valueÚexc_tracebackr@   r@   rA   Úbatch_convert_docx_to_pdfv  s.   


€ý
€ïr  c                 C   s6   g }| D ]}t |dƒsi |_||jd< | |¡ q|S )NÚmetadataÚ
session_id)Úhasattrr	  r¦   )r÷   r
  Úupdated_documentsÚdocr@   r@   rA   Úadd_session_id_to_documentsŽ  s   

r  c           L         s(  zwt j t j t¡¡d }t j |¡st d¡ t  |¡ g }d}t  	| ¡}t
| d ƒ}|t
| d ƒ }t
| d ƒ}|t
| d ƒ }t| ƒ t| | ||ƒ z t
| d ƒ}	|	D ]}
t|
|
 d	d
¡ƒ | |
 d¡d ¡ qWW n) ty— } zt ¡ \}}}t d| ¡ t d|
 ¡ d}W Y d }~nd }~ww zt
| d ƒ}|D ]}t|ƒ | | d¡d ¡ q¡W n) tyÜ } zt ¡ \}}}t d| ¡ t d| ¡ d}W Y d }~nd }~ww t
| d ƒ}|D ]!}t|ƒ}|süt d| ¡ t d| ¡ qå| | d¡d ¡ qåt
| d ƒ}|D ]C}zt|| dd¡ƒ | | d¡d ¡ W q tyR } zt ¡ \}}}t d| ¡ t d| ¡ d}W Y d }~qd }~ww ttdƒddd}t
| d ƒt
| d ƒ t
| d ƒ t
| d  ƒ }g d!¢}g }|D ]}| t
t j | d"|› ¡ƒ¡ q||D ]“}t|ƒ}t|ƒ}| d#¡s§| d$¡rªd%}n| d&¡r³d'}nd%}zN| td(d)t d*d(d+œd,d-d.|› d/|› id0œgd)g¡}t j !t j "|¡¡d d1 | d2¡d  d3 }t j | |¡}t#|j$|ƒ | | d¡d ¡ W q‘ ty$ } zt d4|› d5|› ¡ d}W Y d }~q‘d }~ww t j t j t¡¡d6 } t%| ƒ}!t j&| d7d8 g ‰ |!D ]Í}"t j | |"¡}t'|ƒ}#d9g i}$z_t(t)|#j*ƒƒD ]}%|#j*|% }&|& +¡ }'|$d9  d*|'i¡ qYt,|$d:d;}(|" d3d<¡})t j | |)¡}*t-|*d=ƒ}+|+ .|(¡ W d   ƒ n	1 s˜w   Y  t d>|" ¡ ˆ  |" d2¡d ¡ W q@ ty } zPz%|" d3d<¡})t j | |)¡},t/||,ƒ t d?|" ¡ ˆ  |" d2¡d ¡ W n# ty } zt d@|" ¡ t dA| ¡ d}W Y d }~nd }~ww W Y d }~q@d }~ww t0t
| dB ƒƒD ],}-t1|-dCdDdE}.|. 2¡ }/|/rAt3|/|ƒ}0tdƒt j4dF< t5ƒ }1t6j7|/t8||1t6 9¡ dG}2qt dH¡ t
| dI ƒ}3|3D ]K}
zt:|
dJdK}4|4j;|
 dLdM¡dDdN | |
 d¡d ¡ W qP ty› } zt ¡ \}}}t d| ¡ t dO|
 ¡ d}W Y d }~qPd }~ww t
| dP ƒ}5|5D ]L}
zt:|
dQdJdR}4|4j;|
 dSdT¡dDdN | |
 d¡d ¡ W q¤ tyð } zt ¡ \}}}t d| ¡ t dO|
 ¡ d}W Y d }~q¤d }~ww t j t j t¡¡dU }6t j |6¡s	t  |6¡ t dV¡ t
| d ƒ}7g ‰|7D ]L}8zt<|8ƒ}9|9j= >¡ }:|8 d¡d };t?|8|:|6|;ƒ ˆ |;¡ W q tyd } zt ¡ \}}}t d| ¡ t dO|8 ¡ d}W Y d }~qd }~ww t0t
|6dB ƒƒD ][}-zt1|-dWdDdE}.|. 2¡ }<W n, ty¨ } zt ¡ \}}}t d| ¡ t dO|- ¡ d}g }<W Y d }~nd }~ww |<rÇt3|<|ƒ}=tdƒt j4dF< t5ƒ }1t6j7|<t8||1t6 9¡ dG}2qmt dX¡ z%t@| |ƒ}>|>ròt3|>|ƒ}?tdƒt j4dF< t5ƒ }1t6j7|>t8||1t6 9¡ dG}2W n ty }@ zt |@¡ d}W Y d }@~@nd }@~@ww t
| dY ƒ}Ag }B|AD ]Y}-z6tA|-ƒ}.|B |. 2¡ ¡ | |- d¡d ¡ |BrNt3|B|ƒ}Ctdƒt j4dF< t5ƒ }1t6j7|Bt8||1t6 9¡ dG}2W q tyq }@ zd}t dZ|-› d[|@› ¡ W Y d }@~@qd }@~@ww t
| d\ ƒ}Dg }E|DD ]Y}-z6tA|-ƒ}.|E |. 2¡ ¡ | |- d¡d ¡ |Er²t3|E|ƒ}Ftdƒt j4dF< t5ƒ }1t6j7|Et8||1t6 9¡ dG}2W q| tyÕ }@ zt dZ|-› d[|@› ¡ d}W Y d }@~@q|d }@~@ww t
| d] ƒ}Gg }H|GD ]Y}-z6tB|-ƒ}.|H |. 2¡ ¡ | |- d¡d ¡ |Hrt3|H|ƒ}Itdƒt j4dF< t5ƒ }1t6j7|Ht8||1t6 9¡ dG}2W qà ty9 }@ zd}t dZ|-› d[|@› ¡ W Y d }@~@qàd }@~@ww t d^¡ ‡ fd_d`„|D ƒ}J‡fdad`„|D ƒ}K||J |K }t db| ¡ t)|ƒt)|ƒkrid}tC| ƒ tC|6ƒ t dc¡ |W S  ty“ } zt dA| ¡ W Y d }~dS d }~ww )dNz/temp_imagesztemp img path doesn't existrj   z/*.pdfz/*.PDFz/*.xlsxz/*.XLSXz/*.docr+   Ú_docrÚ   rG   rû   zdoc to pdf conversion failed %sr   z/*.txtztxt to pdf conversion failed %sz/*.pptxzException occured at %sz pptx to pdf conversion failed %sz/*.mdr/   z_md.pdfzmd to pdf conversion failed %sr$   zgpt-4or´   )Úopenai_api_keyÚmodelÚ
max_tokensz/*.jpgz/*.jpegz/*.pngz/*.bmp)r4   r2   r1   r3   Ú*r2   r4   z
image/jpegr1   z	image/pngz?You are a useful bot that is especially good at OCR from images)rÙ   r[   )Útyper[   Ú	image_urlÚurlzdata:z;base64,)r  r  rÛ   r¹   r)   zFailed to process image r¢   z/pdf_json_convT)Úexist_okr¥   rÌ   rÜ   r0   rË   zpypdf satisfied %szpdfplumber satisfied %szexception occured pdf %szException occured due to %sz/*.jsonz.pages[]F)rì   rñ   rò   r%   )r÷   Úes_connectionÚ
index_nameÚ	embeddingÚstrategyzpdf index addedz/*.csvÚlatin1rg   r-   z	_csv.xlsxrÝ   z csv to xlsx conversion failed %sz/*.tsvú	)Úseprh   r.   z	_tsv.xlsxz/excel_json_convzentered excel conversion partz
.content[]zexcel index addedz/*.htmlzException occured in z and the error is z/*.htmz/*.xmlzfiles are indexedc                    s6   g | ]}|  d ¡d   d¡d ˆ v r|  d ¡d ‘qS )rÚ   rG   r¹   r   ©rP   ©Ú.0Úi)Úpdf_successr@   rA   Ú
<listcomp>  s   6 z"start_indexing.<locals>.<listcomp>c                    s,   g | ]}|  d ¡d ˆ v r|  d ¡d ‘qS )rÚ   rG   r  r   )Ú
xl_successr@   rA   r$    s   , zsuccess_file_list %szfiles removed)Dr5   r8   r9   r„   r…   Úisdirr§   rô   Úmkdirr6   r   rB   r  r   r:   r¦   rP   r”   rý   rþ   r¨   r|   r™   r¡   r   r   ró   rÀ   r³   r7   Úinvoker	   r   ru   ÚbasenamerŠ   rÙ   r¬   rü   r   Úranger¤   r¥   rÑ   r   rq   ÚwriterØ   r   r   r   r  Úenvironr   r   Úfrom_documentsÚesÚExactRetrievalStrategyr   rá   r   ÚcolumnsÚtolistrè   rú   r
   r   r   )Lr<   Ú	indexnamer
  rY   rö   Ústatus_codeÚorg_filelistÚorg_pdffilesÚorg_excelfilesÚ	doc_filesÚfier  r  r  r  Ú	txt_filesrª   Ú
pptx_filesÚ	pptx_convÚmd_filesÚchainÚimage_filesÚsupported_formatsÚextr²   Úprocessed_image_pathÚimageÚ	mime_typeÚmsgÚpdf_filenamer—   r˜   Úpdf_json_pathr©   Úpdf_filer«   Úpdf_datarÕ   rÖ   r[   Úpdf_jsonÚjson_filenameÚ	json_pathr×   rÓ   r"  rø   Újson_documentsÚjson_documents_with_session_idÚ
embeddingsÚdbÚcsv_filerä   Útsv_fileÚbot_json_pathÚ
xlsx_filesrì   Ú	dataframer0  rã   Úexcel_documentsÚexcel_documents_with_session_idÚJSON_othersÚ#only_json_documents_with_session_idÚerrorÚ	html_fileÚhtml_loaded_documentsÚhtml_documents_with_session_idÚhtm_fileÚhtm_loaded_documentsÚhtm_documents_with_session_idÚxml_fileÚxml_loaded_documentsÚxml_documents_with_session_idÚpdf_org_succÚex_org_succr@   )r#  r%  rA   Ústart_indexing˜  sŒ  


þ€üþ€ü€ü
0"ÿÿþþÿü.€þ
ÿ
€ý€ù
û€
€ü€ü	


€ü€û
û€


û€
€þ
û€$€ü

û€€þ
û€$€þ

€þre  )]r5   rý   Úpandasr   r   r   r   Úgensonr   Úlangchain.chat_modelsr   Úlangchain.schema.messagesr   r	   Ú$langchain_community.document_loadersr
   r   Ú langchain_community.vectorstoresr   Úlangchain.text_splitterr   Úlangchain_openair   Úshutilr   Úlangchain.document_loadersr   Úelasticsearchr   ÚPyPDF2r   Únatsortr   Újsonr   r   r   Údecoupler   Údoc2pdfr   ÚPyPDF2.errorsr   rÏ   Údocx2txtr   Úreportlab.lib.pagesizesr   Úreportlab.pdfgenr   Úreportlab.lib.utilsr   Úfpdfr   Úspire.presentationr   r    Úpypandocr!   r®   ÚPILr"   r8   r¦   r9   r„   r…   Úattach_ind_logr#   ÚgetRootLoggerr§   r,  r.  rN  rô   rB   r`   r|   rŠ   r™   r¡   r¬   r³   rÀ   rÂ   rÊ   rØ   rè   rð   rú   r  r  re  r@   r@   r@   rA   Ú<module>   sp     
l	
