o
    UiZ                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlZd dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlm Z m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z-m.Z. d dl/m0Z0 d dl+m1Z1m.Z. d dlm2Z2 d dl3Z3d dl4m5Z5 d dl6m7Z7 d dl6m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlmZ d dlCmDZD d dlEmFZF dZGG dd  d e)ZHG d!d" d"e)ZIG d#d$ d$e)ZJG d%d& d&e)ZKG d'd( d(e)ZLd)d* ZMd+d, ZNd-d. ZOd/d0 ZPd1d2 ZQd3d4 ZRd5d6 ZSd7d8 ZTd9d: ZUd;d< ZVdBd>d?ZWd@dA ZXdS )C    N)validateValidationError)	AIMessage)OllamaEmbeddings)FAISS)ChatPromptTemplate)AsyncWebCrawler)OutputFixingParser)PromptTemplate)StrOutputParserPydanticOutputParser)RunnablePassthrough)PyPDFLoader
TextLoaderDocx2txtLoaderUnstructuredPowerPointLoader)	PdfReader)SemanticChunker)	BaseModelField)ListOptionalDict)urlparse)Unionr   )RunnableLambda)Presentation)	OllamaLLM)
ChatOllama)PdfConverter)create_model_dict)text_from_rendered)Path)MarkdownHeaderTextSplitter)YouTubeTranscriptApi)fetch_image_for_slidenomic-embed-textc                   @   sz   e Zd ZU edZeed< edddZee ed< edddZ	e
e ed	< edd
dZeed< edddZee ed< dS )SlideContentflashtypeNz$An optional subheading for the slidedescription
subheading.z(List of paragraphs for the slide content
paragraphsz^A detailed suggestion for a relevant visualization or image (5-8 words with specific elements)visualization_suggestionzURL of the image for the slideimage)__name__
__module____qualname__r   r)   str__annotations__r,   r   r-   r   r.   r/    r5   r5   Q/var/www/eduai.edurigo.com/storigo/production/generate_storigo_content_youtube.pyr'   -   s   
 r'   c                   @   s^   e Zd ZU edZeed< edddZeed< edddZe	e ed< edd	dZ
eed
< dS )
MCQContentQuestionr)   .zThe multiple-choice questionr*   questionzA list of 4 answer optionsoptionsz0The correct answer (e.g., 'a', 'b', 'c', or 'd')correct_answerN)r0   r1   r2   r   r)   r3   r4   r9   r:   r   r;   r5   r5   r5   r6   r7   4   s
   
 r7   c                   @   s>   e Zd ZU edddZeeef ed< edddZ	e
ed< dS )StorigoContent.7Dictionary of slide contents with slide numbers as keysr*   slides/Total token count for all the generated contenttoken_countN)r0   r1   r2   r   r>   r   r3   r'   r4   r@   intr5   r5   r5   r6   r<   :   s   
 r<   c                   @   sZ   e Zd ZU edddZeeef ed< edddZ	eee
f ed< edddZeed< d	S )
StorigoContentMCQ.r=   r*   r>   z8Dictionary of MCQs with identifiers like 'mcq_1' as keysmcqsr?   r@   N)r0   r1   r2   r   r>   r   r3   r'   r4   rC   r7   r@   rA   r5   r5   r5   r6   rB   >   s   
 rB   c                   @   sF   e Zd ZU edddZeeeee	f f e
d< edddZee
d< dS )StorigoContentMCQMid.zYDictionary of slide contents with slide numbers as keys and MCQs with MCQ numbers as keysr*   r>   r?   r@   N)r0   r1   r2   r   r>   r   r3   r   r'   r7   r4   r@   rA   r5   r5   r5   r6   rD   C   s   
 $rD   c                 C   s   t d| }t|S )Nz\w+)refindalllen)texttokensr5   r5   r6   count_tokensG   s   rJ   c              	      s   t dd4 I d H H}|j| dI d H }t| }|j|jdd }|ds+|d7 }t|d}||j	 W d    n1 sAw   Y  |W  d   I d H  S 1 I d H sXw   Y  d S )NT)verbose)url/_z.txtw)
r   arunr   netlocpathreplaceendswithopenwritemarkdown)filecrawlerresult
parsed_urlfilenametxt_filer5   r5   r6   	crawlerrrK   s   
0r^   c                 C   sV   | rt j| s
dS t| ddd}| }W d    n1 s w   Y  |r)|S dS )N rutf-8encoding)osrR   isfilerU   read)inputrX   contentr5   r5   r6   read_file_urlV   s   
ri   c              
   C   s   d| v r|  dd  dd }nd| v r"|  dd  dd }ntdz5t|}d	}|D ]
}|d
|d  7 }q0t|ddd}|| W d    |W S 1 sTw   Y  |W S  tys } ztd| dt| d }~ww )Nz	youtu.be/   ?r   zyoutube.com/watch?v=zv=&zInvalid YouTube URL formatr_    rH   rO   ra   rb   z0Unable to retrieve transcript for YouTube video zI. The video may not have subtitles enabled or may be unavailable. Error: )split
ValueErrorr$   get_transcriptrU   rV   	Exceptionr3   )youtube_video_urlvideo_idtranscript_text
transcriptifer5   r5   r6   
transcribe]   s(   

ry   c                 C   s   t | }| S N)r   load)	file_pathloaderr5   r5   r6   load_txtq   s   r~   c                 C   s$   t |dd}|dd | D }|S )N
percentilebreakpoint_threshold_typec                 S   s   g | ]}|j qS r5   page_content.0docr5   r5   r6   
<listcomp>y   s    z4split_text_with_semantic_chunker.<locals>.<listcomp>)r   create_documentsdocs
embeddingstext_splitter	documentsr5   r5   r6    split_text_with_semantic_chunkeru   s
   r   c                 C   s>   d}t ddd}tdg|d}||B }|d| i}|j}|S )Nak  
    Extract only the meaningful content from the text below. Focus on descriptions, value propositions, mission statements,
    features, and anything that provides valuable information about the company, products, or services. Ignore any URLs,
    navigation links, contact forms, or irrelevant sections.

    Here is the content to process:

    {context}
    http://127.0.0.1:11434
gemma3:12b)base_urlmodelcontext)input_variablestemplate)r   r
   invokerh   )rh   prompt_templatellmpromptrunnablefiltered_contentr5   r5   r6   clean_using_llm|   s   
r   c                 C   sf   t |dd}t| tr| g} t| d trdd | D } tdd | D s'g S |dd | D }|S )	Nr   r   r   c                 S   s   g | ]}d |iqS r   r5   r   r5   r5   r6   r          z<split_text_with_semantic_chunker_for_url.<locals>.<listcomp>c                 s   s"    | ]}t |tod |v V  qdS )r   N)
isinstancedictr   r5   r5   r6   	<genexpr>   s     z;split_text_with_semantic_chunker_for_url.<locals>.<genexpr>c                 S   s   g | ]}|d  qS r   r5   r   r5   r5   r6   r      r   )r   r   r3   allr   r   r5   r5   r6   (split_text_with_semantic_chunker_for_url   s   
r   c           	      C   s   t |}tjd|}tj|dd tdd}t| ddD ](\}}tj|}tj|dd tj|g|d}tj|d	| }|	| qd S )
Nmy_embeddingsT)exist_okr&   r   rj   start	embeddingfaiss_index)
r3   rd   rR   joinmakedirsr   	enumerater   from_documents
save_local)	split_documents	client_idembedding_folder_baser   idxr   embedding_foldertemp_dbembedding_file_pathr5   r5   r6   create_and_save_embeddings   s   
r   c                 C   s*   t dd}tj| |d}d}|| |S )Nr&   r   r   faiss_supplier_index)r   r   r   r   )r   r   r   vectorstorefaiss_index_pathr5   r5   r6   create_embeddings   s
   

r   r   c                    s  t dd}d }| d|  }dd t|D }t|dd d}|D ])}tj||}tj||d	d
  fdd j	 D }	|d u rF }q"|
|	 q"|d urX|| d |D ]*}tj||}zt| W qZ tys   Y qZ ty }
 zW Y d }
~
qZd }
~
ww |S )Nr&   r   rM   c                 S   s.   g | ]}| d r|td d  r|qS )r   N)
startswithrG   isdigit)r   folderr5   r5   r6   r      s    z#merge_all_faiss.<locals>.<listcomp>c                 S   s   t | ddS )Nr   r_   )rA   rS   xr5   r5   r6   <lambda>   s    z!merge_all_faiss.<locals>.<lambda>keyT)allow_dangerous_deserializationc                    s   g | ]	} j |jqS r5   )docstoresearchr   )r   doc_idcurrent_faissr5   r6   r      s    z/merged_faiss)r   rd   listdirsortedrR   r   r   
load_localindex_to_docstore_idvalues	add_textsr   shutilrmtreeFileNotFoundErrorOSError)r   	base_pathr   merged_faissfolder_pathfaiss_filessorted_filesrX   
faiss_pathcurrent_textsrx   r5   r   r6   merge_all_faiss   s:   

r   c           :   
      s  zzt | }tdd}	t|}
t|
|	}t|}t||	}W n= ty] } z1td|  dt|  t	| I d H }t
|}|sEtdt|}tdd}	t||	}W Y d }~nd }~ww t|| t|}tdddd	}t|jj  t }d8ddtfdd D }g }|krtjd|d td} fdd|D }nldd  D }t|

fdd|D }dd |D }t| }d}|dkr|dkr||  d7  < |d8 }n|dk r|| dkr||  d8  < |d7 }|d | }|dkst |D ]\}}||g|  qddd |d  D d}ttd}tj||dt|} fdddd fddd | B |B B }!|! d!d"}"t!|"d#rZ|"j"}#nt!|"d$rd|"# n|"}$|$$d#i }#t%|#tri }%t&|#dd%D ]\}}&|&|%d&| < qz|%}#d'tfd(d)t't(|#) fd*dd+t%|tr|dk}'nt%|tr|* d,v }'nd-}'|'r) D ]\}(})|)j+rt,|(|)j+}*|*r|*|)_-qd |)_-qd |)_-qn D ]})d |)_-qd}+ D ]}&|&j. d.d.|&j/ d.|&j+ },|+t0|,7 }+q|rd/t1 		fd0d1}-|- I d H }.|. D ]}/|/d2  d.d.|/d3  d.|/d4  },|+t0|,7 }+q)i }0d}1}2}3|dkr|3dkr\|2|3 n|2}4t&	D ]*\}5}(|( |0|(< |5d |4 dkr|1|3k rd5|1d  }6|.|6 |0|6< |1d7 }1qbt2|0|+d6}7|7W S 	D ]	}(|( |0|(< qt3|3D ]}1d5|1d  }6|.|6 |0|6< qt2|0|+d6}7|7W S t|+d6}8|8W S  ty }9 z	td7t|9 d }9~9ww )9Nr&   r   zTranscript not available for z2, falling back to page content extraction. Error: z+Failed to retrieve content from YouTube URLr   r   ffffff?r   r   temperature2   c                 S   s   t dtt|  | S )Nrj   )maxroundrG   rn   )rH   words_per_slider5   r5   r6   estimate_slides   s   z7generate_slide_content_youtube.<locals>.estimate_slidesc                 3   s    | ]} |j V  qd S rz   r   r   )r   r5   r6   r      s    z1generate_slide_content_youtube.<locals>.<genexpr>r   rj   )numdtypec                    s   g | ]} | qS r5   r5   )r   rv   )
all_chunksr5   r6   r      r   z2generate_slide_content_youtube.<locals>.<listcomp>c                 S   s   g | ]	}t |j qS r5   )rG   r   rn   )r   cr5   r5   r6   r          c                    s   g | ]}|   qS r5   r5   )r   rO   )
num_slidestotal_wordsr5   r6   r      s    c                 S   s   g | ]	}t d t|qS )rj   )r   r   )r   r   r5   r5   r6   r     r   z

c                 s   s    | ]}|j V  qd S rz   r   r   r5   r5   r6   r     s    u  
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

STRICT RULES:
- ❗ ONLY use the content provided in the 'context' below.
- ❌ DO NOT introduce any external knowledge, definitions, or examples not present in the context.
- ⚠️ Do not assume common sense or use general facts. Stick to the exact information given.
- ⚠️ Avoid generic phrases like "as we know", "in general", or "in this video".

Each slide must include:
- A clear and concise **sub-heading**
- **Exactly 2–4 concise paragraphs** derived solely from the context
- A **detailed visualization suggestion** (5-8 words, specific to the content with concrete elements)
- Include specific details: people, objects, actions, settings to make it highly unique
- CRITICAL: Each slide's visualization suggestion MUST BE COMPLETELY UNIQUE across all slides - no overlapping concepts, objects, or scenes
- If slides are related, vary all elements (people, objects, actions, settings) significantly to ensure completely different images

Important: Only output the final JSON object. No additional text, markdown, or explanation should be included.

Context:
{context}

{format_instructions}

The final output must be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}".
Each slide must contain:
- "subheading"
- "paragraphs"
- "visualization_suggestion"
)pydantic_object)parserr   c                    s    S rz   r5   r   )context_textr5   r6   r   5  s    z0generate_slide_content_youtube.<locals>.<lambda>c                 S   s   | d S )Nr   r5   r   r5   r5   r6   r   6      c                    s      S rz   )get_format_instructionsr   )r   r5   r6   r   7  r   )r   r   format_instructionsr_   )queryr   r>   
model_dumpr   slide_item_keyc                 S   sL   |  dd\}}|dkrd}n	|dkrd}nd}| r t|nd}||fS )NrN   rj   slider   mcqi  )rn   r   rA   )r   prefixnum_strgroupnumberr5   r5   r6   custom_slide_sort_keyN  s   z=generate_slide_content_youtube.<locals>.custom_slide_sort_keyc                    s    | d S )Nr   r5   )kv)r   r5   r6   r   Y  s    r   )1trueyes0Frm   ur  
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context
- Exactly **4 answer options**
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Dont give Here is the MCQ while generating MCQ
- Directly follow the format given below

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"

    Always give "question","options","correct_answer" these labels in double quotes only
}}
c               	      s  t tdddd i } g }tdtdD ]L}t| kr# nCg }t|t|d tD ]}|  }||j dd|j	  q1d	|} fd
d}t|k re|||}|| qt
j|ddiI d H }	|	D ]}
t|
tr|
rdt| d  }|
| |< qs| S )Nr   r   r   r   r      z: rm   
c              
      sb  zt  d  fddI d H }t|dr|j}nt|}|d}|dkrjd}d}t||d  |dD ]\}}|dkrD|d7 }q7|d	krT|d8 }|dkrT|} nq7|dkrg|||d  }	t	|	}
nW d S W d S |	rs|	
 svW d S |
d
d|
dg |
ddd}|d
 rt|d dkr|d r|W S W d S  ty } zW Y d }~d S d }~ww )Nc                      s   t  fddB B i S )Nc                    s   d iS )Nr   r5   r   ctx_textr5   r6   r     r   z|generate_slide_content_youtube.<locals>.generate_mcqs_async.<locals>.generate_single_mcq.<locals>.<lambda>.<locals>.<lambda>)r   r   r5   )r  mcq_llm
mcq_promptr5   r6   r     s   zjgenerate_slide_content_youtube.<locals>.generate_mcqs_async.<locals>.generate_single_mcq.<locals>.<lambda>rh   {r   r   rj   }r9   r_   r:   r;   )r9   r:   r;      )asyncioget_event_looprun_in_executorhasattrrh   r3   findr   jsonloadsstripgetrG   rq   )r  r   
mcq_resultrh   r   open_bracesendidx_charcharcontent_onlyjson_objectformatted_mcqrx   r  r  r  r6   generate_single_mcq  sV   

	




zXgenerate_slide_content_youtube.<locals>.generate_mcqs_async.<locals>.generate_single_mcqreturn_exceptionsTmcq_rj   )r   from_templater   rangerG   minappendr,   r   r-   r  gatherr   r   )rC   tasksrv   context_slidesjr   r   r  taskresultsrZ   mcq_key)mcq_templatenum_mcqsordered_slides
slide_keysr  r6   generate_mcqs_async  s:   
 
6

z;generate_slide_content_youtube.<locals>.generate_mcqs_asyncr9   r:   r;   r  )r>   r@   z Error generating slide content: )r   )4ry   r   r~   r   r   r   rq   printr3   r^   ri   r   r   r   listr   _dictr   rG   sumnplinspacerA   zipextendr   r   r<   r	   from_llmr   r   r   r  r>   r   r  r   r   r   r   itemsr  r.   r%   r/   r,   r-   rJ   keysrD   r!  ):document_urlr   r   r,  is_imageis_questionquestion_positionGPUr|   r   rH   split_documents1meaningful_contentr   transcript_errorr\   raw_contentmerge_embeddingsr   total_chunkstotal_possibleselected_chunksindicesword_counts	raw_allocallocdiffrv   chunkcountslide_content_template
raw_parserslide_content_promptslide_content_chainrZ   slides_datarawslides_dictr   is_image_bool	slide_keyslide_content
image_pathr@   text_contentr/  rC   r   interleaved_contentmcq_countertotal_slides
total_mcqsintervalr   r*  storigo_contentstorigo_content_without_mcrx   r5   )r   r   r   r   r+  r,  r   r-  r   r.  r   r6   generate_slide_content_youtube   s  














  W&
rb  )r   )Yrd   timerandomrE   r  heapq
jsonschemar   r   langchain_core.messagesr   requestsnumpyr4  langchain_community.embeddingsr    langchain_community.vectorstoresr   langchain_core.promptsr   crawl4air   r  langchain.output_parsersr	   r
   langchain_core.output_parsersr   r   langchain_core.runnablesr   $langchain_community.document_loadersr   r   r   r   PyPDF2r   $langchain_experimental.text_splitterr   pydanticr   r   typingr   r   r   urllib.parser   r   r   r   pptxr   langchain_ollamar   r   marker.converters.pdfr   marker.modelsr    marker.outputr!   pathlibr"   langchain_text_splittersr#   youtube_transcript_apir$   storigo_image_generatorr%   OLLAMA_MODELr'   r7   r<   rB   rD   rJ   r^   ri   ry   r~   r   r   r   r   r   r   rb  r5   r5   r5   r6   <module>   sr    
