o
    oh                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlZd dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl-m3Z3m0Z0 d dlm4Z4 d dl5Z5d dl6m7Z7 d dlm8Z8 d dlm9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlmZ dZDdZEdZFG d d! d!e+ZGG d"d# d#e+ZHG d$d% d%e+ZIG d&d' d'e+ZJG d(d) d)e+ZKG d*d+ d+eZLd,d- ZMd.d/ ZNd0d1 ZOd2d3 ZPd4d5 ZQd6d7 ZRd~d9d:ZSdd<d=ZTd>d? ZUd@dA ZVdBdC ZWdDdE ZXdFdG ZYdHdI ZZdJdK Z[dLdM Z\dNdO Z]dPdQ Z^dRdS Z_dTdU Z`d dlaZaddWdXZbdYdZ Zcdd\d]Zddd^d_Zed d`lfmgZg dadb Zhdcdd ZidedG ZYdfdg Zjdhdi ZkddkdlZlG dmdn dnZmdoenfdpdqZodrds Zpdtdu ZqerdvkrdwZsdxes ZtdyZudzZvd{Zwd|Zxd}Zyd|Zzd8Z{d8Z|eqeseweyexeze{e| dS dS )    N)validateValidationError)	AIMessage)OllamaEmbeddings)FAISS)ChatPromptTemplate)AsyncWebCrawler)OutputFixingParser)ChatGroq)PromptTemplate)StrOutputParserPydanticOutputParser)RunnablePassthrough)PyPDFLoader
TextLoaderDocx2txtLoaderUnstructuredPowerPointLoader)	PdfReader)SemanticChunker)	BaseModelField)ListOptionalDict)urlparse)Unionr   )RunnableLambda)Presentation)	OllamaLLM)
ChatOllama)PdfConverter)create_model_dict)text_from_rendered)Path)MarkdownHeaderTextSplitter8gsk_CEh3itIpUAkEkEKsUDqVWGdyb3FYoTjqmXNTBHOSxJFK3obGTzXZnomic-embed-textz"44622834-f22df6f12cf45558ee180dd8dc                   @   sz   e Zd ZU edZeed< edddZee ed< edddZ	e
e ed	< edd
dZeed< edddZee ed< dS )SlideContentflashtypeNz$An optional subheading for the slidedescription
subheading.z(List of paragraphs for the slide content
paragraphszUA specific and concise suggestion for a relevant visualization or image (max 5 words)visualization_suggestionzURL of the image for the slideimage)__name__
__module____qualname__r   r)   str__annotations__r,   r   r-   r   r.   r/    r5   r5   I/var/www/eduai.edurigo.com/storigo/production/generate_storigo_content.pyr'   /   s   
 r'   c                   @   s^   e Zd ZU edZeed< edddZeed< edddZe	e ed< edd	dZ
eed
< dS )
MCQContentQuestionr)   .zThe multiple-choice questionr*   questionzA list of 4 answer optionsoptionsz0The correct answer (e.g., 'a', 'b', 'c', or 'd')correct_answerN)r0   r1   r2   r   r)   r3   r4   r9   r:   r   r;   r5   r5   r5   r6   r7   8   s
   
 r7   c                   @   s>   e Zd ZU edddZeeef ed< edddZ	e
ed< dS )StorigoContent.7Dictionary of slide contents with slide numbers as keysr*   slides/Total token count for all the generated contenttoken_countN)r0   r1   r2   r   r>   r   r3   r'   r4   r@   intr5   r5   r5   r6   r<   >   s   
 r<   c                   @   sZ   e Zd ZU edddZeeef ed< edddZ	eee
f ed< edddZeed< d	S )
StorigoContentMCQ.r=   r*   r>   z8Dictionary of MCQs with identifiers like 'mcq_1' as keysmcqsr?   r@   N)r0   r1   r2   r   r>   r   r3   r'   r4   rC   r7   r@   rA   r5   r5   r5   r6   rB   E   s   
 rB   c                   @   sF   e Zd ZU edddZeeeee	f f e
d< edddZee
d< dS )StorigoContentMCQMid.zYDictionary of slide contents with slide numbers as keys and MCQs with MCQ numbers as keysr*   r>   r?   r@   N)r0   r1   r2   r   r>   r   r3   r   r'   r7   r4   r@   rA   r5   r5   r5   r6   rD   L   s   
 $rD   c                   @   s   e Zd Zdd ZdS )CustomMCQParserc              
   C   s   t |tr
d|}|dd}|dd }zt|}| j|W S  tj	y: } z	t
dt| d }~w t
yN } z	t
dt| d }~ww )N '"zHere is the MCQ: zError decoding JSON: zError parsing result: )
isinstancelistjoinreplacestripjsonloadspydantic_objectmodel_validateJSONDecodeError	Exceptionr3   )selfresultjson_objecter5   r5   r6   parse_resultQ   s   


zCustomMCQParser.parse_resultN)r0   r1   r2   rY   r5   r5   r5   r6   rE   P   s    rE   c              
   C   sV   zt | }d}|jD ]}|| 7 }q
|W S  ty* } z	tdt| d }~ww )NrI   z Error extracting text from PDF: )r   pagesextract_textrT   r3   )pdf_pathreadertextpagerX   r5   r5   r6   extract_text_from_pdfj   s   
r`   c                       t |}|  dd| }td| }tg d  fdd|D }|   fdd|D  }|d tdt| }d|S )NrF   \w+andortheaaninonattoforofwithbyc                       g | ]}| vr|qS r5   r5   .0wordcommon_wordsr5   r6   
<listcomp>       z)generate_search_query.<locals>.<listcomp>c                    rq   r5   r5   rr   suggestion_wordsr5   r6   rw      rx      )	extract_context_keywordsrL   refindalllowersetsplitminlenr.   slide_contentcontext_keywordscombined_querywordsfiltered_wordsprioritized_wordsquery_wordsr5   rv   rz   r6   generate_search_queryt      
r   c                    ra   )NrF   rb   rc   c                    rq   r5   r5   rr   ru   r5   r6   rw      rx   z-generate_search_query_new.<locals>.<listcomp>c                    rq   r5   r5   rr   ry   r5   r6   rw      rx   r{   )	extract_context_keywords_newrL   r}   r~   r   r   r   r   r   r   r5   r   r6   generate_search_query_new   r   r   c                    s^   | j pd dd| j }td| }tg d  fdd|D }tt|d d S )NrI   rF   rb   rc   c                    rq   r5   r5   rr   ru   r5   r6   rw      rx   z,extract_context_keywords.<locals>.<listcomp>   )r,   rL   r-   r}   r~   r   r   rK   r   r^   r   keywordsr5   ru   r6   r|      s
   r|   c                    sf   |  dd dd|  dg  }td| }tg d  fdd|D }tt|d d	 S )
Nr,   rI   rF   r-   rb   rc   c                    rq   r5   r5   rr   ru   r5   r6   rw      rx   z0extract_context_keywords_new.<locals>.<listcomp>r   )getrL   r}   r~   r   r   rK   r   r5   ru   r6   r      s
   $r   c              
   C   s   d}t | dddddd}z/tj||d}|  | }|d	 r2t|d	 d
d dd}|d d W S td|   W d S  tjyX } ztdt|  W Y d }~d S d }~w t	ys } ztdt|  W Y d }~d S d }~ww )Nzhttps://pixabay.com/api/photo
horizontal   true	relevance)keyq
image_typeorientationper_page
safesearchorder)paramshitsc                 S   s   | d | d  S )Nlikes	downloadsr5   xr5   r5   r6   <lambda>       z%fetch_pixabay_image.<locals>.<lambda>Tr   reverser   webformatURLzNo image found for query: z#Error fetching image from Pixabay: z)Unexpected error in fetch_pixabay_image: )
PIXABAY_API_KEYrequestsr   raise_for_statusrO   sortedprintRequestExceptionr3   rT   )queryurlr   responsedatasorted_hitsrX   r5   r5   r6   fetch_pixabay_image   s6   
r   r   c                 C      | st d d S t|D ]S}z1t| |}t d|d  d|  t|}|r2t d|  |W   S t d|  td W q ty_ } zt d|d  dt|  W Y d }~qd }~ww t d	| d
 d S Nz%No visualization suggestion provided.zAttempt r   z to fetch image for query: zValid image found: z!No image URL returned for query: z"Error in get_valid_image (attempt z): zNo valid image found after z	 attempts)r   ranger   r   timesleeprT   r3   r.   r   max_attemptsattemptr   	image_urlrX   r5   r5   r6   get_valid_image   &   

(r   r   c                 C   r   r   )r   r   r   r   r   r   rT   r3   r   r5   r5   r6   get_valid_image_new   r   r   c                 C   s   t d| }t|S )Nrb   )r}   r~   r   )r^   tokensr5   r5   r6   count_tokens   s   r   c           =         s   z|dkrt dtd}ntddd}t| jj  td t  t }	dWd	d
t	fdd D }
td|
  g }|	kr]td t
jd|	d td} fdd|D }notd dd  D }t	|fdd|D }dd |D }t	| }d}|dkr|dkr||  d7  < |d8 }n|dk r|| dkr||  d8  < |d7 }|d |	 }|dkst |D ]\}}||g|  qddd |d  D td t d}ttd}tj||dt|}fdddd fd dd!|B |B B }|d"d#}d$tfd%d&tt|j fd'dd(}t|tr4|dk}nt|trA| d)v }nd*}|r{| D ]/\}}|jrmt|j|}|r`||_ qJtd+| d, d |_ qJtd-| d. d |_ qJn| D ]}d |_ qd}| D ]}|j! d/d/|j" d/|j } |t#| 7 }q|rd0}!d1d2 }"t|!}#tddd }}i }$t|$ }%t%dt|%d3D ]}t|$|k rPg }&t%|t&|d3 t|%D ]}'||%|'  }|&'|j! d4d/|j"  qd5|&td6 t z)t(fd7d|#B |B i }(td8 t|( td9 td: t)|(d;r5|(j*})nt|(})|)+d<}*|*d=krd}+d=},t,|)|*d  |*d>D ]"\}}-|-d<kr`|+d7 }+qQ|-d?krr|+d8 }+|+dkrr|}, nqQ|,d=kr|)|*|,d  }.t|. t-.|.}/n	td@ ntdA tdB t)|(d;r|(j*n|(}0tdC t|0 tdD t|. tdE |.r|. st/dFzN|/0dGd"|/0dHg |/0dId"dJ}1tdK tt-j1|1dLdM |1dG rt|1dH dLkr|1dI rdNt|$d  }2|1|$|2< tdO|2 dP nt/dQW n  t-j2y1 }3 ztdR|3  tdS|0 W Y d }3~3nd }3~3ww W q t3yO }3 ztdT|3  W Y d }3~3qd }3~3ww q|$ D ]}4|4dG  d/d/|4dH  d/|4dI  } |t#| 7 }qVi }5d}6}7|}8|dkr|8dkr|7|8 n|7}9t,|%D ].\}:}|| |5|< |:d |9 dkr|6|8k rdN|6d  }2t|2 |$|2 |5|2< |6d7 }6qt4|5|dU};|;W S |%D ]	}|| |5|< qt%|8D ]}6dN|6d  }2|$|2 |5|2< qt4|5|dU};|;W S t||dU}<|<W S  t3y }3 z	t3dVt|3 d }3~3ww )XNr   llama3-70b-8192
model_namegroq_api_keyhttp://127.0.0.1:11434
gemma3:12bbase_urlmodelYOYO2   c                 S   s   t dtt|  | S Nr   )maxroundr   r   )r^   words_per_slider5   r5   r6   estimate_slides  s   z/generate_slide_content.<locals>.estimate_slidesc                 3   s    | ]} |j V  qd S Npage_contentrs   doc)r   r5   r6   	<genexpr>  s    z)generate_slide_content.<locals>.<genexpr>u.   🧠 Estimated possible slides from document: u>   📊 Scenario 1: Fewer slides than chunks — sampling evenly.r   )numdtypec                    s   g | ]} | qS r5   r5   )rs   i)
all_chunksr5   r6   rw   %      z*generate_slide_content.<locals>.<listcomp>uI   📊 Scenario 2: More slides than chunks — distributing proportionally.c                 S   s   g | ]	}t |j qS r5   )r   r   r   )rs   cr5   r5   r6   rw   (      c                    s   g | ]}|   qS r5   r5   )rs   w)
num_slidestotal_wordsr5   r6   rw   *  rx   c                 S   s   g | ]	}t d t|qS r   )r   r   )rs   r   r5   r5   r6   rw   +  r   z

c                 s   s    | ]}|j V  qd S r   r   r   r5   r5   r6   r   =  s    zContext text is u=  
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

STRICT RULES:
- ❗ ONLY use the content provided in the 'context' below.
- ❌ DO NOT introduce any external knowledge, definitions, or examples not present in the context.
- ⚠️ Do not assume common sense or use general facts. Stick to the exact information given.
- ⚠️ Avoid generic phrases like “as we know”, “in general”, or “in this video”.

Each slide must include:
- A clear and concise **sub-heading**
- **Exactly 2–4 concise paragraphs** derived solely from the context
- A **visualization suggestion** (max 5 words, specific to the content)

Important: Only output the final JSON object. No additional text, markdown, or explanation should be included.

Context:
{context}

{format_instructions}

The final output must be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}".
Each slide must contain:
- "subheading"
- "paragraphs"
- "visualization_suggestion"
rQ   )parserllmc                    s    S r   r5   r   context_textr5   r6   r   c  s    z(generate_slide_content.<locals>.<lambda>c                 S      | d S Nr   r5   r   r5   r5   r6   r   d      c                          S r   get_format_instructionsr   r   r5   r6   r   e  r   )contextr   format_instructionsrI   r   r   item_keyc                 S   L   |  dd\}}|dkrd}n	|dkrd}nd}| r t|nd}||fS )z
            Sort keys so that:
            - 'slide_1', 'slide_2', ..., 'slide_10' are in numeric order
            - 'mcq_1', 'mcq_2', ... come after slides in numeric order
            _r   slider   mcq  r   isdigitrA   r   prefixnum_strgroupnumberr5   r5   r6   custom_slide_sort_keyp  s   z5generate_slide_content.<locals>.custom_slide_sort_keyc                        | d S Nr   r5   kvr  r5   r6   r         r   )1r   yes0F+Warning: No suitable image found for slide  after multiple attempts./Warning: No visualization suggestion for slide .rF   us  
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context
- Exactly **4 answer options**
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Dont give Here is the MCQ while generating MCQ
- Directly follow the format given below

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"

    Always give "question","options","correct_answer" these labels in double quotes only
}}

c                 S   s(   zt |  W dS  t jy   Y dS w )NTF)rO   rP   rS   )r   r5   r5   r6   is_valid_json  s   
z-generate_slide_content.<locals>.is_valid_json   : 
Contextc                    s   d iS )Nr   r5   r   r   r5   r6   r     r   qwerENDtokens1content{start}z No matching closing brace found.zNo opening brace found.matchmcq_contentcontent_onlycontent_only12z%Empty or invalid response from Ollamar9   r:   r;   )r9   r:   r;   zFormatted MCQ:   )indentmcq_u   ✅ MCQ  generated and saved!zIncomplete MCQ datazJSON Decode Error: zRaw Response:Error generating MCQ: r>   r@    Error generating slide content: )r   )5r
   GROQ_API_KEYr   rK   docstore_dictvaluesr   r   sumnplinspacerA   zipextendrL   r   r<   r	   from_llmr   from_templateinvoker3   dictr   r>   itemsrJ   rN   r.   r   r/   r,   r-   r   keysr   r   appendr   hasattrr  find	enumeraterO   rP   
ValueErrorr   dumpsrS   rT   rD   )=vectors	client_idr   num_mcqsis_imageis_questionquestion_positionGPUr   total_chunkstotal_possibleselected_chunksindicesword_counts	raw_allocallocdiffr   chunkcountslide_content_template
raw_parserslide_content_promptslide_content_chainrV   ordered_slidesis_image_bool	slide_keyr   r   r@   r   text_contentmcq_templater  
mcq_promptrC   
slide_keyscontext_slidesj
mcq_resultr  r  open_bracesendcharr"  rW   r!  formatted_mcqmcq_keyrX   r   interleaved_contentmcq_countertotal_slides
total_mcqsintervalidxstorigo_contentstorigo_content_without_mcr5   )r   r   r  r   r   r   r   r6   generate_slide_content  s  











 !

"










&Z&
rl  c                 C      t | }| S zLoad PDF and return documents.)r   load	file_pathloaderr5   r5   r6   load_pdf[     rs  c                 C   s   t dd | D S )z1Count the total number of words in the documents.c                 s   s     | ]}t |d   V  qdS r   N)r   r   r   r5   r5   r6   r   c  s    z$count_total_words.<locals>.<genexpr>)r/  )docsr5   r5   r6   count_total_wordsa  s   rw  c                 C   ,   t |dd}|dd | D }td |S )@Splits the text into semantic chunks using the given embeddings.
percentilebreakpoint_threshold_typec                 S      g | ]}|j qS r5   r   r   r5   r5   r6   rw   j      4split_text_with_semantic_chunker.<locals>.<listcomp>%Documents split into semantic chunks.r   create_documentsr   rv  
embeddingstext_splitter	documentsr5   r5   r6    split_text_with_semantic_chunkere  s   r  c              	      s   t dd4 I d H U}|j| dI d H }t|j t| }|j|jdd }|ds0|d7 }t	|d}|
|j W d    n1 sFw   Y  td t| |W  d   I d H  S 1 I d H sew   Y  d S )	NT)verbose)r   /r   .txtr   z	HELLO jII)r   arunr   markdownr   netlocpathrM   endswithopenwrite)filecrawlerrV   
parsed_urlfilenametxt_filer5   r5   r6   	crawlerrro  s   

0r  c              
   C   s   z:t d | rtj| st d|  d W dS t| ddd}| }W d    n1 s.w   Y  |r8|W S dW S  tyT } zt d|  W Y d }~dS d }~ww )	NStartedzError: The file 'z*' does not exist or the path is incorrect.rI   rutf-8encodingzError reading the file: )r   osr  isfiler  readrT   )inputr  r  rX   r5   r5   r6   read_file_url  s   
r  c                 C   sR   d}t ddd}tdg|d}||B }|d| i}t| |j}tt| |S )Nak  
    Extract only the meaningful content from the text below. Focus on descriptions, value propositions, mission statements,
    features, and anything that provides valuable information about the company, products, or services. Ignore any URLs,
    navigation links, contact forms, or irrelevant sections.

    Here is the content to process:

    {context}
    r   r   r   r   )input_variablestemplate)r   r   r6  r   r  r)   )r  prompt_templater   promptrunnablefiltered_contentr5   r5   r6   clean_using_llm  s   
r  c                 C   s   t |dd}t| tr| g} tdt|   td| r| d nd  t| d tr2dd | D } td	d
 | D sAtd g S |dd | D }td t| |S )ry  rz  r{  zType of docs after conversion: zFirst item in docs: r   z
Empty listc                 S   s   g | ]}d |iqS r   r5   r   r5   r5   r6   rw     r   z<split_text_with_semantic_chunker_for_url.<locals>.<listcomp>c                 s   s"    | ]}t |tod |v V  qdS ru  )rJ   r7  r   r5   r5   r6   r     s     z;split_text_with_semantic_chunker_for_url.<locals>.<genexpr>z"Error: Invalid document structure.c                 S   s   g | ]}|d  qS r   r5   r   r5   r5   r6   rw     r   r  )r   rJ   r3   r   r)   allr  r  r5   r5   r6   (split_text_with_semantic_chunker_for_url  s    
r  c              	   C   s   t j|st | t| D ]7\}}d|d  d}t j||}t|ddd}||j W d   n1 s:w   Y  t	d|  qdS )	zBSaves each document in the documents list as a separate .txt file.document_part_r   r  r   r  r  NzSaved: )
r  r  existsmakedirsr=  rL   r  r  r   r   )r  
output_dirr   document	file_namerq  r  r5   r5   r6   save_documents_to_txt  s   
r  c           	      C      t |}tjd|}tj|dd tdd}t| ddD ]5\}}tj|}tj|dd tj|g|d}tj|d	| }|	| t
d
| d| d|  qd S Nmy_embeddingsTexist_okr&   r   r   r  	embeddingfaiss_indexz(Saved FAISS embedding for document part z as faiss_indexz in r3   r  r  rL   r  r   r=  r   from_documents
save_localr   	split_documentsrA  embedding_folder_baser  ri  r   embedding_foldertemp_dbembedding_file_pathr5   r5   r6   create_and_save_embeddings     

r  c                 C   s@   t |}tdd}tj| |d}d}|| td|  |S )Nr&   r  r  faiss_supplier_indexu3   ✅ Created FAISS vectorstore in memory for client )r3   r   r   r  r  r   )r  rA  r  vectorstorefaiss_index_pathr5   r5   r6   create_embeddings  s   

r  faiss_chunksc           
   	   C   s   |du r	t dd}tj|dd g }t| D ]F\}}tj|d| }tj|dd t|g|}|| |	| t
tj|dd}	t||	 W d   n1 sWw   Y  qtd	t|  d
| d |S )a  
    Save each chunk in its own FAISS vector store directory.

    Args:
        documents (List[Document]): List of LangChain Document objects.
        base_path (str): Base directory to store all FAISS chunks.
        embedding_model: Optional embedding model instance.
        api_key (str): Required if embedding_model is not passed.

    Returns:
        List[str]: List of FAISS chunk folder paths.
    Nr&   r  Tr  chunk_zdoc_metadata.pklwbu
   ✅ Saved z( chunks as individual FAISS indexes in 'rG   )r   r  r  r=  r  rL   r   r  r  r:  r  pickledumpr   r   )
r  	base_pathembedding_modelapi_keychunk_pathsr   r   	chunk_dirvector_storefr5   r5   r6   save_faiss_per_chunk  s    


r  c           	      C   r  r  r  r  r5   r5   r6   create_and_save_embeddings_new;  r  r  r  c                 C   s<  t dd}d }| d|  }dd t|D }t|dd d}|D ]$}tj||}td	|  tj||d
d}	|d u rA|	}q"|	|	 q"|d urS|
| d t| |D ]B}tj||}zt| td|  W qY ty   td|  Y qY ty }
 ztd| d|
  W Y d }
~
qYd }
~
ww |S )Nr&   r  r  c                 S   .   g | ]}| d r|td d  r|qS r  N
startswithr   r   rs   folderr5   r5   r6   rw   d      z$merge_all_faiss1.<locals>.<listcomp>c                 S      t | ddS Nr  rI   rA   rM   r   r5   r5   r6   r   j  r   z"merge_all_faiss1.<locals>.<lambda>r
  Loading FAISS index from: Tallow_dangerous_deserialization/merged_faissDeleted FAISS index folder: Folder not found: Error deleting r  )r   r  listdirr   r  rL   r   r   
load_local
merge_fromr  shutilrmtreeFileNotFoundErrorOSError)rA  r  r  merged_faissfolder_pathfaiss_folderssorted_foldersr  
faiss_pathcurrent_faissrX   r5   r5   r6   merge_all_faiss1[  s:   

 r  c                    sT  t dd}d }| d|  }dd t|D }t|dd d}|D ]0}tj||}td	|  tj||d
d  fdd j	
 D }	|d u rM }q"||	 q"|d urc|| d td |D ]B}tj||}zt| td|  W qe ty   td|  Y qe ty }
 ztd| d|
  W Y d }
~
qed }
~
ww |S )Nr&   r  r  c                 S   r  r  r  r  r5   r5   r6   rw     r  z#merge_all_faiss.<locals>.<listcomp>c                 S   r  r  r  r   r5   r5   r6   r     r   z!merge_all_faiss.<locals>.<lambda>r
  r  Tr  c                    s   g | ]	} j |jqS r5   )r,  searchr   )rs   doc_idr  r5   r6   rw     s    r  z(Merged FAISS index saved as merged_faissr  r  r  r  )r   r  r  r   r  rL   r   r   r  index_to_docstore_idr.  	add_textsr  r  r  r  r  )rA  r  r  r  r  faiss_filessorted_filesr  r  current_textsrX   r5   r  r6   merge_all_faiss  s@   


 r  )YouTubeTranscriptApic                 C   s   |  dd }t|}t| d}|D ]
}|d|d  7 }qt|ddd}|| W d    n1 s6w   Y  td	|  |S )
N=r   rI   rF   r^   r   r  r  zTranscript saved to )r   r   get_transcriptr   r  r  )youtube_video_urlvideo_idtranscript_text
transcriptr   r  r5   r5   r6   
transcribe  s   
r  c                 C   rm  rn  )r   ro  rp  r5   r5   r6   load_txt  rt  r  c                 C   rx  )ry  rz  r{  c                 S   r}  r5   r   r   r5   r5   r6   rw     r~  r  r  r  r  r5   r5   r6   r    s   c                 C   s&   t t d}|| }t|\}}}|S )N)artifact_dict)r    r!   r"   )r  	converterrenderedr^   r   imagesr5   r5   r6   parsing  s   r  c                 C   s&   t | }||}td t| |S )NCCC)r$   
split_textr   )headers_to_split_onr  markdown_splittermd_header_splitsr5   r5   r6   marks_splitter  s
   
r  <   c                    s|  g }t | D ])\}}t|dr|j}nt|dr|j}nt|tr$|}nt|}|||f q fdd|D }|s=i S tdd |D }i }	|}
|D ]\}}t|| }t	dt
|| }||	|< |
|8 }
qL|
dk rt	|	|	jd	}|	| dkr|	|  d8  < |
d7 }
nn|
dk sl|
dkrt|d
d dd}tt|
t|D ]}|| d }|	|dd |	|< |
d8 }
q|
dks|	S )ae  
    Allocate slides based on chunk size (works on Document-like objects).

    Args:
        chunks: List of Document (or string) objects
        total_slides: Total number of slides to generate
        min_chars: Minimum character count to consider a chunk valid

    Returns:
        Dict mapping original chunk index to number of slides to generate
    r   r  c                    s$   g | ]\}}t | kr||fqS r5   r   )rs   r   txt	min_charsr5   r6   rw     s   $ z#allocate_slides.<locals>.<listcomp>c                 s   s    | ]	\}}t |V  qd S r   r  )rs   r   r  r5   r5   r6   r   	  s    z"allocate_slides.<locals>.<genexpr>r   r   r
  c                 S   s   t | d S r   r  r   r5   r5   r6   r   !  r	  z!allocate_slides.<locals>.<lambda>Tr   )r=  r;  r   r  rJ   r3   r:  r/  r   r   r   r   r   r   r   )chunksrf  r  	extractedr   rO  r^   validtotal_charsallocations	remainingri  r  propcntmax_idxsorted_by_sizer5   r  r6   allocate_slides  sH   






r#  c                   @   sP   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd ZdS )SlideCollectionz2Container class for slides with dict-like behaviorc                 C   s
   i | _ d S r   r>   rU   r5   r5   r6   __init__.     
zSlideCollection.__init__c                 C   s   || j |< d S r   r%  )rU   r   r  r5   r5   r6   	add_slide1  s   zSlideCollection.add_slidec                 C   
   | j  S r   )r>   r9  r&  r5   r5   r6   r9  5  r(  zSlideCollection.keysc                 C   
   t | jS r   )iterr>   r&  r5   r5   r6   __iter__8  r(  zSlideCollection.__iter__c                 C   s
   | j | S r   r%  )rU   r   r5   r5   r6   __getitem__;  r(  zSlideCollection.__getitem__c                 C   r*  r   )r>   r8  r&  r5   r5   r6   r8  >  r(  zSlideCollection.itemsc                 C   r*  r   )r>   r.  r&  r5   r5   r6   r.  A  r(  zSlideCollection.valuesc                 C   r+  r   )reprr>   r&  r5   r5   r6   __repr__D  r(  zSlideCollection.__repr__N)r0   r1   r2   __doc__r'  r)  r9  r-  r.  r8  r.  r0  r5   r5   r5   r6   r$  ,  s    r$  returnc                 C   s   t | dr	| j}nt| }| }d|v sd|v r3|d}|dd }|dkr3|dkr3||| }td	d
|}zt	|}d|v rLt
|d W S W |S    Y |S )Nr  zHere's another attemptzI apologizer  r  r   r  r   z	'(\w+)\":z"\1":
properties)r;  r  r3   rN   r<  rfindr}   subrO   rP   r?  )
ai_messager^   r  r`  parsedr5   r5   r6   quick_json_fixG  s&   


r8  c           0         s  zd}t tdt|}	|dkrtdtd}
ntddd}
d	d
 dd
 fdd
d|	B |
B B }t }d}t|D ]~}|| }| | }t	d t	| t
|dt
|dt|}t	d t	| t	d |||d}t	d t	| t|dr|j}nt|dr| n|}|di }t|trd|v r|d t| dd
 dD ]}|d| ||  |d7 }qq<t	d t	d| t	t| dtfdd  tt|j  fd!d
d}||_t	d" t	|j |rht	| |j D ]n\}}t|tr
td@i |}n|}|jrMt|j|}t	d# t	| |r3t|tr/||j| d$< q||_qt	d%| d& t|trId |j| d$< qd |_qt	d'| d( t|trcd |j| d$< qd |_qn|jD ]}t|j| trd |j| d$< qkd |j| _qk|rt	d) d*}t td}t|}tt}tj |
|d+}i }t!|j } t"dt#| d,D ]}!t#||k r\g }"t"|!t$|!d, t#| D ]2}#| |# }$|j|$ }%t|%tr|%d-d.}&|%d/g }'n|%j%}&|%j&}'|"'|& d0d1(|'  qd2(|"}(t	d3 t	|( z3||
B |B |B |(|) d4})t	d5 t	d6|)  |)|d7t#|d  < t	d8t#| d9 W q t*y[ }* zt	d:|*  W Y d }*~*qd }*~*ww qi }+d},|dkr|dkrp|| n|}-t+| D ]:\}.}|j| |+|< |.d |- dkr|,|k rd7|,d  }/|/|v r||/ |+|/< t	d;|/ d<|  |,d7 },qvn0| D ]
}|j| |+|< qt"|D ]},d7|,d  }/|/|v r||/ |+|/< qt	d=|/  qt,|+dd>W S t|jdd>W S  t*y }* z	t*d?t|* d }*~*ww )ANa  
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

Each slide must include:
- A clear and concise **sub-heading**
- **Paragraphs** that effectively communicate the key ideas and insights
- A specific, concise **visualization suggestion**

**Context**: {query}

Focus on creating content that is both informative and engaging. Ensure each slide:
- Has a well-structured sub-heading that captures the main point
- Uses clear and concise paragraphs to communicate important information

Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
- **Thought-provoking questions** to encourage reflection
- **Relevant statistics** or data points that add credibility
- **Industry insights** or emerging trends to demonstrate expertise
- **Practical examples** or case studies to illustrate key concepts
- **Calls to action** to guide the audience toward specific actions or takeaways

For the visualization suggestion:
- Provide a clear and specific description of an image that would be relevant to the slide content.
- Keep it very concise, using a maximum of 5 words.
- Focus on concrete objects, scenes, or concepts that can be easily visualized.
- Avoid abstract or overly complex ideas.
- Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material.

{format_instructions}

CRITICAL: The output must be a valid JSON object with this EXACT structure:
{{
  "slides": {{
    "slide_1": {{
      "type": "flash",
      "subheading": "...",
      "paragraphs": ["...", "..."],
      "visualization_suggestion": "...",
      "image": null
    }},
    "slide_2": {{ ... }}
  }},
  "token_count": 0
}}

DO NOT put "token_count" inside the "slides" object. It must be at the root level.
DO NOT include any explanations or additional text - only the JSON object.
The final output must be in strict sequential order: "slide_1", "slide_2", ..., up to "slide_{num_slides}".
r   r   r   r   r   r   r   c                 S   r   )Nr   r5   r   r5   r5   r6   r     r   z/generate_slide_content_alloc1.<locals>.<lambda>c                 S   r   r   r5   r   r5   r5   r6   r     r   c                    r   r   r   r   r   r5   r6   r     r   )r   r   r   r   z;===========================================================r   r  Queryz	Query endr   rV   r>   
model_dumpr@   c                 S   s   t | dd S )Nr   r   )rA   r   )kr5   r5   r6   r     r~  r
  slide_
all_slideszslides_output
r   c                 S   r   )Nr   r   r   r   r   r   r   r   r5   r5   r6   r    s   z<generate_slide_content_alloc1.<locals>.custom_slide_sort_keyc                    r  r  r5   r  r  r5   r6   r     r	  yessImageURLr/   r  r  r  r  zMCQ Startedu/  
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context
- Exactly **4 answer options**
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Dont give Here is the MCQ while generating MCQ
- Directly follow the format given below

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"
}}

{format_instructions}
)r   r   r  r,   rI   r-   r  rF   r  r  )r   r   zV======================================================================================zMCQ Result: r&  u   ✅ MCQ mcq_r'  r(  zAdded z after z [Warning] Skipping missing MCQ: r)  r*  r5   )-r   r<   r   r5  r
   r+  r   r$  r   r   getattrr3   r6  r;  r>   r:  r   rJ   r7  popr9  r)  r)   r8  r'   r.   r   r/   r7   r   r8  r	   r4  rK   r   r   r   r,   r-   r:  rL   r   rT   r=  rD   )0r  r  r   rB  rC  rD  rE  rF  rQ  rS  r   rT  r=  counter	chunk_idxnrO  r   rV   slide_itemsrawrW  rU  r   	slide_objr   rY  
mcq_parserrZ  
json_fixeroutput_fixing_parserrC   r[  r   r\  r]  r   r   titleparasr   r^  rX   rd  re  rh  ri  rc  r5   )r  r   r6   generate_slide_content_alloc1d  s6  
4








 



&



rM  c              	   C   s   t | }tdddd}|| W d    n1 sw   Y  g d}	tdddd}| }
W d    n1 s:w   Y  t|	|
}td t||dd	}td
t|  tdt|  td t||||||||}t| d S )Nzparse_data.mdr   r  r  ))#zHeader 1)z##zHeader 2)z###zHeader 3)z####zHeader 4r  Chunksd   r  zTotal chunks: zValid chunks: z
Slide allocation:)	r  r  r  r  r  r   r#  r   rM  )r  r   rB  rC  rD  rE  rF  
parse_datar  r  r  r  
allocationr   r5   r5   r6   main  s    

rS  __main__z-Chapter3-Basic-Requirement-in-the-Kitchen.pdfztemp/output_embeddingsi\B   T   r   )r   )r  NN)r  )r  )}r  r   randomr}   rO   heapq
jsonschemar   r   langchain_core.messagesr   r   numpyr0  langchain_ollamar    langchain_community.vectorstoresr   langchain_core.promptsr   crawl4air   asynciolangchain.output_parsersr	   langchain_groqr
   r   langchain_core.output_parsersr   r   langchain_core.runnablesr   $langchain_community.document_loadersr   r   r   r   PyPDF2r   $langchain_experimental.text_splitterr   pydanticr   r   typingr   r   r   urllib.parser   r   r   r  pptxr   r   r   marker.converters.pdfr    marker.modelsr!   marker.outputr"   pathlibr#   langchain_text_splittersr$   r+  OLLAMA_MODELr   r'   r7   r<   rB   rD   rE   r`   r   r   r|   r   r   r   r   r   rl  rs  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  youtube_transcript_apir   r  r  r  r  r#  r$  r3   r8  rM  rS  r0   r  r  r  rA  r   rC  rB  rD  rE  rF  r5   r5   r5   r6   <module>   s    	


  X

%
 
00
B  

