o
    phH)                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZ d dlZdd	 Zd
d Zd(ddZdd Zdd Zdd Zdd Zdd Zdd Zdd Zd)ddZ	 d d! Zed"krd#Z d$e  Z!d%Z"d&Z#d'Z$ee e!e#e$ dS dS )*    N)OllamaEmbeddings)FAISS)PyPDFLoader
TextLoaderDocx2txtLoader)RecursiveCharacterTextSplitter)Presentation)SemanticChunkerc                 C   s$   |  dd}tdddd| g |S )N.ppt.pptxlibreofficez
--headlessz--convert-topptx)replace
subprocessrun)	file_pathpptx_file_path r   @/var/www/eduai.edurigo.com/question_generate/staging/training.pyconvert_ppt_to_pptx   s   r   c                 C   s\  t j| d  }|dkrt| }| }n_|dkr$t| }| }nR|dv r4t| 	d}|g}nB|dkr\t
| }g }|jD ]}|jD ]}t|drS||j qFqAd|g}n|d	krot| }t|}t | |S td
| |s~td g S tddd}	g }
|D ]#}t|tr|}nt|dr|j}ntd| q|	|}|
| q|
S )N   z.pdf.txt)z.docxz.docutf-8r   text
r
   zUnsupported file type: zNo documents loaded from file.i  2   )
chunk_sizechunk_overlappage_contentUnsupported document format:)ospathsplitextlowerr   loadr   textractprocessdecoder   slidesshapeshasattrappendr   joinr   load_and_split_documentremove
ValueErrorprintr   
isinstancestrr   
split_textextend)r   file_extensionloaderdocsr   prsslideshaper   text_splitterfinal_documentsdocchunksr   r   r   r-      sR   









r-   nomic-embed-textc                 C   sb   t |d}g }| D ]}t|tr|| q	t|dr"||j q	td| q	tj||d}|S )Nmodelr   r   )texts	embedding)	r   r1   r2   r+   r*   r   r0   r   
from_texts)	documentsrA   
embeddingsrB   r=   vectorsr   r   r   create_embeddingsC   s   


rH   c                 C      t dd | D S )Nc                 s   s    | ]	}t | V  qd S )Nlensplit.0r=   r   r   r   	<genexpr>S   s    zcount_tokens.<locals>.<genexpr>sum)rE   r   r   r   count_tokensR   s   rR   c                 C   s   t | }| S )zLoad PDF and return documents.)r   r$   )r   r6   r   r   r   load_pdf[   s   rS   c                 C   s(   d}| D ]}|j  }|t|7 }q|S )z+Counts total tokens in the given documents.r   )r   rL   rK   )rE   total_tokensdocumenttokensr   r   r   count_tokens_in_documentsa   s
   
rW   c                 C   rI   )z1Count the total number of words in the documents.c                 s   s     | ]}t |d   V  qdS )r   NrJ   rM   r   r   r   rO   l   s    z$count_total_words.<locals>.<genexpr>rP   )r7   r   r   r   count_total_wordsj   s   rX   c                 C   s,   t |dd}|dd | D }td |S )z@Splits the text into semantic chunks using the given embeddings.
percentile)breakpoint_threshold_typec                 S   s   g | ]}|j qS r   )r   rM   r   r   r   
<listcomp>s   s    z4split_text_with_semantic_chunker.<locals>.<listcomp>z%Documents split into semantic chunks.)r	   create_documentsr0   )r7   rF   r;   rE   r   r   r    split_text_with_semantic_chunkern   s   r]   c              	   C   s   t j|st | t| D ]7\}}d|d  d}t j||}t|ddd}||j W d   n1 s:w   Y  t	d|  qdS )	zBSaves each document in the documents list as a separate .txt file.document_part_r   r   wr   )encodingNzSaved: )
r    r!   existsmakedirs	enumerater,   openwriter   r0   )rE   
output_dirirU   	file_namer   filer   r   r   save_documents_to_txtx   s   
rj   c           
      C   s   t |}t |}tjd||}tj|dd tdd}t| ddD ]5\}}tj|}tj|dd tj|g|d}tj|d	| }	|	|	 t
d
| d| d|  q"d S )Nmy_embeddingsT)exist_okr?   r@   r   )start)rC   faiss_indexz(Saved FAISS embedding for document part z as faiss_indexz in )r2   r    r!   r,   rb   r   rc   r   from_documents
save_localr0   )
split_documents	client_idreference_idembedding_folder_baserF   idxr=   embedding_foldertemp_dbembedding_file_pathr   r   r   create_and_save_embeddings   s   

ry   rk   c                    sZ  t dd}d }| d|  d| }dd t|D }t|dd d}|D ]0}tj||}	td	|	  tj|	|d
d  fdd j	
 D }
|d u rP }q%||
 q%|d urb|| d t| |D ]B}tj||}	zt|	 td|	  W qh ty   td|	  Y qh ty } ztd|	 d|  W Y d }~qhd }~ww |S )Nr?   r@   /c                 S   s.   g | ]}| d r|td d  r|qS )rn   N)
startswithrK   isdigit)rN   folderr   r   r   r[      s    z#merge_all_faiss.<locals>.<listcomp>c                 S   s   t | ddS )Nrn    )intr   )xr   r   r   <lambda>   s    z!merge_all_faiss.<locals>.<lambda>)keyzLoading FAISS index from: T)allow_dangerous_deserializationc                    s   g | ]	} j |jqS r   )docstoresearchr   )rN   doc_idcurrent_faissr   r   r[      s    z/merged_faisszDeleted FAISS index folder: zFolder not found: zError deleting z: )r   r    listdirsortedr!   r,   r0   r   
load_localindex_to_docstore_idvalues	add_textsrp   shutilrmtreeFileNotFoundErrorOSError)rr   rs   	base_pathrF   merged_faissfolder_pathfaiss_folderssorted_foldersr}   
faiss_pathcurrent_textser   r   r   merge_all_faiss   s@   


 r   c                 C   sp   t   }td|  t| }t|}tdd}t||}t|| t||| d| d| }		 t||}
d S )NzStart Time: r?   r@   zmy_embeddings/rz   )	timer0   rS   rW   r   r]   rj   ry   r   )r   rf   rr   rs   
start_timer7   Total_countrF   rq   embedding_dirmerge_embeddingsr   r   r   main   s   


r   __main__z	cyber.pdfztemp/output_embeddingsi  iy  )r?   )rk   )%r   r    langchain_community.embeddingsr    langchain_community.vectorstoresr   $langchain_community.document_loadersr   r   r   langchain.text_splitterr   r   r   r%   r   $langchain_experimental.text_splitterr	   r   r   r-   rH   rR   rS   rW   rX   r]   rj   ry   r   r   __name__r   rf   rt   rr   rs   r   r   r   r   <module>   s@    
0		

4#
