
    zKiA                     ~   S SK r S SKrS SKrS SKrS SKrS SKrS SKJrJr  S SK	J
r
  S SKrS SKrS SKJr  S SKJr  S SKJr  S SKJr  S SKrS SKJr  S S	KJr  S S
KJrJr  S SKJr  S SKJ r J!r!J"r"J#r#  S SK$J%r%  S SK&J'r'  S SK(J)r)J*r*  S SK+J,r,J-r-J.r.  S SK/J0r0  S SK+J1r1J.r.  S SKJ2r2  S SK3r3S SK4J5r5  S SK6J7r7  S SK6J8r8  S SK9J:r:  S SK;J<r<  S SK=J>r>  S SK?J@r@  S SKAJBrB  S SKJr  S SKCJDrD  SrE " S S\)5      rF " S  S!\)5      rG " S" S#\)5      rH " S$ S%\)5      rI " S& S'\)5      rJS( rKS) rLS* rMS1S+ jrN " S, S-5      rOS.\P4S/ jrQS0 rRg)2    N)validateValidationError)	AIMessage)OllamaEmbeddings)FAISS)ChatPromptTemplate)AsyncWebCrawler)OutputFixingParser)PromptTemplate)StrOutputParserPydanticOutputParser)RunnablePassthrough)PyPDFLoader
TextLoaderDocx2txtLoaderUnstructuredPowerPointLoader)	PdfReader)SemanticChunker)	BaseModelField)ListOptionalDict)urlparse)Unionr   )RunnableLambda)Presentation)	OllamaLLM)
ChatOllama)PdfConverter)create_model_dict)text_from_rendered)Path)MarkdownHeaderTextSplitter)fetch_image_for_slideznomic-embed-textc                       \ rS rSr% \" S5      r\\S'   \" SSS9r\	\   \S'   \" SS	S9r
\\   \S
'   \" SSS9r\\S'   \" SSS9r\	\   \S'   Srg)SlideContent,   flashtypeNz$An optional subheading for the slidedescription
subheading.z(List of paragraphs for the slide content
paragraphsz^A detailed suggestion for a relevant visualization or image (5-8 words with specific elements)visualization_suggestionzURL of the image for the slideimage )__name__
__module____qualname____firstlineno__r   r*   str__annotations__r-   r   r.   r   r/   r0   __static_attributes__r1       O/var/www/eduai.edurigo.com/storigo/testing/generate_storigo_content_document.pyr'   r'   ,   sn    gD# %d8^ _J_!#3]^JS	^$)#  <\  %]c  ] 3STE8C=Tr9   r'   c                   ~    \ rS rSr% \" S5      r\\S'   \" SSS9r\\S'   \" SSS9r	\
\   \S	'   \" SS
S9r\\S'   Srg)
MCQContent3   Questionr*   .zThe multiple-choice questionr+   questionzA list of 4 answer optionsoptionsz0The correct answer (e.g., 'a', 'b', 'c', or 'd')correct_answerr1   N)r2   r3   r4   r5   r   r*   r6   r7   r?   r@   r   rA   r8   r1   r9   r:   r<   r<   3   sK    j!D#!#+IJHcJs0LMGT#YM1cdNCdr9   r<   c                   P    \ rS rSr% \" SSS9r\\\4   \	S'   \" SSS9r
\\	S'   Srg	)
StorigoContent9   .7Dictionary of slide contents with slide numbers as keysr+   slides/Total token count for all the generated contenttoken_countr1   N)r2   r3   r4   r5   r   rF   r   r6   r'   r7   rH   intr8   r1   r9   r:   rC   rC   9   s0    &+C=v&wFDl"#wS._`K`r9   rC   c                   r    \ rS rSr% \" SSS9r\\\4   \	S'   \" SSS9r
\\\4   \	S'   \" SSS9r\\	S	'   S
rg)StorigoContentMCQ=   .rE   r+   rF   z8Dictionary of MCQs with identifiers like 'mcq_1' as keysmcqsrG   rH   r1   N)r2   r3   r4   r5   r   rF   r   r6   r'   r7   rM   r<   rH   rI   r8   r1   r9   r:   rK   rK   =   sK    &+C=v&wFDl"#w"'9s"tD$sJ
tS._`K`r9   rK   c                   Z    \ rS rSr% \" SSS9r\\\\	\
4   4   \S'   \" SSS9r\\S'   Srg	)
StorigoContentMCQMidB   .zYDictionary of slide contents with slide numbers as keys and MCQs with MCQ numbers as keysr+   rF   rG   rH   r1   N)r2   r3   r4   r5   r   rF   r   r6   r   r'   r<   r7   rH   rI   r8   r1   r9   r:   rO   rO   B   sD    9>s  Ql  :mFDeL*4556  mS._`K`r9   rO   c                 F    [         R                  " SU 5      n[        U5      $ )Nz\w+)refindalllen)texttokenss     r:   count_tokensrW   F   s    ZZ%Fv;r9   c                 T    [        [        5       S9nU" U 5      n[        U5      u  p4nU$ )N)artifact_dict)r    r!   r"   )input	converterrenderedrU   _imagess         r:   parsingr_   J   s-    +<+>?IH(2ODVKr9   c                 >    [        U 5      nUR                  U5      nU$ N)r$   
split_text)headers_to_split_oncontentmarkdown_splittermd_header_splitss       r:   marks_splitterrg   P   s%    23FG(33G<r9   c                 \   / n[        U 5       Hv  u  pE[        US5      (       a  UR                  nOA[        US5      (       a  UR                  nO#[	        U[
        5      (       a  UnO[        U5      nUR                  XF45        Mx     U VVs/ s H  u  pG[        U5      U:  d  M  XG4PM     nnnU(       d  0 $ [        S U 5       5      n	0 n
UnU H2  u  p[        U5      U	-  n[        S[        X-  5      5      nXU'   X-  nM4     US:  a6  [        XR                  S9nX   S:  a  X==   S-  ss'   US-  nOOUS:  a  M6  US:  a]  [        US SS	9n[        [        U[        U5      5      5       H'  nUU   S   nU
R                  US5      S-   X'   US-  nM)     US:  a  M]  U
$ s  snnf )
Npage_contentrd   c              3   <   #    U  H  u  p[        U5      v   M     g 7fra   rT   ).0r]   txts      r:   	<genexpr>"allocate_slides.<locals>.<genexpr>f   s     3U61c#hhUs      r   keyc                     [        U S   5      $ )Nrp   rk   xs    r:   <lambda>!allocate_slides.<locals>.<lambda>y   s    S1Yr9   T)rr   reverse)	enumeratehasattrri   rd   
isinstancer6   appendrT   summaxroundgetsortedrangemin)chunkstotal_slides	min_chars	extractedichunkrU   rm   validtotal_charsallocations	remainingidxpropcntmax_idxsorted_by_sizes                    r:   allocate_slidesr   U   s   If%5.))%%DUI&&==Ds##Du:D!# & %.GI&!SY1FXaXIEG	3U33KKI3x+%!U4./0C		  a-k7!# A% NI a- a-+>Ms9c.&9:;A #A&C*sA6:KNI < a- ; Hs   F(&F(c                   D    \ rS rSrS rS rS rS rS rS r	S r
S	 rS
rg)SlideCollection   c                     0 U l         g ra   rF   selfs    r:   __init__SlideCollection.__init__   s	    r9   c                      X R                   U'   g ra   r   )r   rr   rd   s      r:   	add_slideSlideCollection.add_slide   s    "Cr9   c                 6    U R                   R                  5       $ ra   )rF   keysr   s    r:   r   SlideCollection.keys   s    {{!!r9   c                 ,    [        U R                  5      $ ra   )iterrF   r   s    r:   __iter__SlideCollection.__iter__       DKK  r9   c                      U R                   U   $ ra   r   )r   rr   s     r:   __getitem__SlideCollection.__getitem__   s    {{3r9   c                 6    U R                   R                  5       $ ra   )rF   itemsr   s    r:   r   SlideCollection.items   s    {{  ""r9   c                 6    U R                   R                  5       $ ra   )rF   valuesr   s    r:   r   SlideCollection.values   s    {{!!##r9   c                 ,    [        U R                  5      $ ra   )reprrF   r   s    r:   __repr__SlideCollection.__repr__   r   r9   r   N)r2   r3   r4   r5   r   r   r   r   r   r   r   r   r8   r1   r9   r:   r   r      s*    #"! #$!r9   r   returnc                    [        U S5      (       a  U R                  nO[        U 5      nUR                  5       nSU;   d  SU;   a5  UR	                  S5      nUR                  S5      S-   nUS:w  a
  US:w  a  XU n[        R                  " S	S
U5      n [        R                  " U5      nSU;   a  [        R                  " US   5      $  U$ !    U$ = f)Nrd   zHere's another attemptzI apologize{}rp   r   z	'(\w+)\":z"\1":
properties)rz   rd   r6   stripfindrfindrR   subjsonloadsdumps)
ai_messagerU   startendparseds        r:   quick_json_fixr      s    z9%%!!:::<D4'=D+@		#jjo!B;3!8c?D66,$/DD!6!::f\233 "
 KKs   4C Cc                 *  ^7^8#     [         R                  " 5       nUR                  S [        U 5      I S h  vN n	[	        SSSS9 n
U
R                  U	5        S S S 5        SS/n[	        SSSS9 n
U
R                  5       nS S S 5        [        UW5      n[        XSS	9nS
n[        [        S9m8[        R                  " U5      n[        SSS9nS S U84S jS.U-  U-  T8-  n[        5       nSn[        U5       H  nUU   nUU   n[!        US[!        US[#        U5      5      5      nUR%                  UUS.5      n['        US5      (       a  UR(                  nO5['        US5      (       a  UR+                  5       OUnUR-                  S0 5      n[/        U[0        5      (       a  SU;   a  UR3                  S5        [        UR5                  5       S S9 H   nUR7                  SU 3UU   5        US-  nM"     GM     S["        4S jm7[1        [        UR(                  R9                  5       U74S jS95      nUUl        U(       Ga
  UR(                  R9                  5        H  u  nn[/        U[0        5      (       a  [;        S/0 UD6nOUnUR<                  (       a  [?        UUR<                  5      n U (       a2  [/        U[0        5      (       a  U UR(                  U   S '   M  U Ul         M  [/        U[0        5      (       a  S UR(                  U   S '   M  S Ul         M  [/        U[0        5      (       a  S UR(                  U   S '   M  S Ul         M     O]UR(                   HM  n[/        UR(                  U   [0        5      (       a  S UR(                  U   S '   M9  S UR(                  U   l         MO     U(       Gac  S!n![        [B        S9n"[        R                  " U!5      n#[E        [F        5      n$[H        RJ                  " UU"S"9n%0 n&[M        UR(                  R5                  5       5      n'[O        S#[Q        U'5      S$5       GH  n([Q        U&5      U:  d  M  / n)[O        U([S        U(S$-   [Q        U'5      5      5       H  n*U'U*   n+UR(                  U+   n,[/        U,[0        5      (       a%  U,R-                  S%S&5      n-U,R-                  S'/ 5      n.OU,RT                  n-U,RV                  n.U)RY                  U- S(S)R[                  U.5       35        M     S*R[                  U)5      n/ U#U-  U$-  U%-  R%                  U/U"R]                  5       S+.5      n0U0U&S,[Q        U&5      S-    3'   GM     0 n2S#n3US:X  ae  US#:  a  X#-  OUn4[a        U'5       HI  u  n5nUR(                  U   U2U'   U5S-   U4-  S#:X  d  M&  U3U:  d  M.  S,U3S-    3n6U6U&;   a  U&U6   U2U6'   U3S-  n3MK     OEU' H  nUR(                  U   U2U'   M     [O        U5       H  n3S,U3S-    3n6U6U&;   d  M  U&U6   U2U6'   M     [c        U2S#S-9$ [        UR(                  S#S-9$  GN! , (       d  f       GN= f! , (       d  f       GN= f! [^         a  n1 S n1A1GM+  S n1A1ff = f! [^         a  n1[_        S.[#        U15       35      eS n1A1ff = f7f)0Nzparse_data.mdwzutf-8)encoding)#zHeader 1)z##zHeader 2rd   )r   a  
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

Each slide must include:
- A clear and concise **sub-heading**
- **Paragraphs** that effectively communicate the key ideas and insights
- A specific, concise **visualization suggestion**

**Context**: {query}

Focus on creating content that is both informative and engaging. Ensure each slide:
- Has a well-structured sub-heading that captures the main point
- Uses clear and concise paragraphs to communicate important information

Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
- **Thought-provoking questions** to encourage reflection
- **Relevant statistics** or data points that add credibility
- **Industry insights** or emerging trends to demonstrate expertise
- **Practical examples** or case studies to illustrate key concepts
- **Calls to action** to guide the audience toward specific actions or takeaways

For the visualization suggestion:
- Provide a clear and specific description of an image that would be relevant to the slide content.
- Keep it very concise, using a maximum of 5 words.
- Focus on concrete objects, scenes, or concepts that can be easily visualized.
- Avoid abstract or overly complex ideas.
- Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material.

{format_instructions}

CRITICAL: The output must be a valid JSON object with this EXACT structure:
{{
  "slides": {{
    "slide_1": {{
      "type": "flash",
      "subheading": "...",
      "paragraphs": ["...", "..."],
      "visualization_suggestion": "...",
      "image": null
    }},
    "slide_2": {{ ... }}
  }},
  "token_count": 0
}}

DO NOT put "token_count" inside the "slides" object. It must be at the root level.
DO NOT include any explanations or additional text - only the JSON object.
The final output must be in strict sequential order: "slide_1", "slide_2", ..., up to "slide_{num_slides}".
)pydantic_objectzhttp://127.0.0.1:11434z
gemma3:12b)base_urlmodelc                     U S   $ )Nqueryr1   rt   s    r:   rv   1generate_slide_content_document.<locals>.<lambda>  s    1W:r9   c                     U S   $ )N
num_slidesr1   rt   s    r:   rv   r     s    ,r9   c                 $   > TR                  5       $ ra   )get_format_instructions)ru   parsers    r:   rv   r     s    1O1O1Qr9   )r   r   format_instructionsrp   ri   rd   )r   r   rF   
model_dumprH   c                 <    [        U R                  S5      S   5      $ )Nr]   rp   )rI   split)ks    r:   rv   r     s    c!''RU,WX/FZr9   rq   slide_item_keyc                     U R                  SS5      u  pUS:X  a  SnOUS:X  a  SnOSnUR                  5       (       a  [        U5      OSnX44$ )Nr]   rp   slider   mcqi  )r   isdigitrI   )r   prefixnum_strgroupnumbers        r:   custom_slide_sort_key>generate_slide_content_document.<locals>.custom_slide_sort_key"  sQ    &nnS!4OF 5%,__%6%6S\CF?"r9   c                    > T" U S   5      $ )Nr   r1   )kvr   s    r:   rv   r   -  s    OdeghiejOkr9   r0   u  
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context
- Exactly **4 answer options**
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Directly follow the format given below

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"
}}

{format_instructions}
)llmr   r      r-    r.   z:  
)contextr   mcq_)rF   rH   z Error generating slide content: r1   )2asyncioget_event_looprun_in_executorr_   openwritereadrg   r   r   rC   r   from_templater   r   r   getattrr6   invokerz   rF   r   r   r{   dictpopr   r   r   r'   r/   r%   r0   r<   r   r   r
   from_llmlistr   rT   r   r-   r.   r|   joinr   	Exceptionry   rO   )9temp_file_path	client_idr   num_mcqsis_imageis_questionquestion_positionGPUloop
parse_datafrc   rd   r   
allocationslide_content_templateslide_content_promptr   slide_content_chain
all_slidescounter	chunk_idxnr   r   resultslide_itemsraw	slide_keyordered_slidesslide_content	slide_obj
image_pathmcq_template
mcq_parser
mcq_prompt
json_fixeroutput_fixing_parserrM   
slide_keysr   context_slidesjrr   r   titleparascontext_text
mcq_resulteinterleaved_contentmcq_counterintervalr   mcq_keyr   r   s9                                                          @@r:   generate_slide_content_documentr,     s~    CE%%'//g~NN
/39QGGJ : 
 /39QffhG : 3W=$V3G
2"h &nE1??@VW-
 .7'Q
 ##   	 %&

+I9%A9%EE>75)SQVZ3XYE(//%q0QRFvx(($mm-4V\-J-Jf'')PV!ggh3+t,,+1M.#K$4$4$6<Z[	$$vgY%7Y9OP1 \! ,(		#C 		# fZ%6%6%<%<%>Dklm*
,6,=,=,C,C,E(	=mT22 , =} =I -I55!6!!::"J "%mT::DNJ--i8A2<M/%mT::DHJ--i8A26M/!-66@D
)))4W=.2+3 -F6 (..	j//	:DAA<@J%%i099=J%%i06	 / L> .jIJ+99,GJ'7J#5#>#>3z#Z Dj//4467J1c*oq1t9x'%'N"1c!a%Z&AB(m * 1 1# 6%eT22$)IIlB$?E$)IIlB$?E$)$4$4E$)$4$4E&--r#((5/9J.KL C $(99^#<L!&!"() 33 !&'33=3U3U3W"  # 8BtCIM?349 2@ #%K A%5=\:1z&/
&;NC5?5F5Fy5Q'	2a8+q0[85K$(q(9":"d?;?=/8#q( '< ",I5?5F5Fy5Q'	2 ", $)?K $[1_$56G$7;G}+G4 $3
 (/BPQRR!):):JJ{ O99 :9r % ! !<  E:3q6(CDDEs   Z/Y* X*Y* X-Y* .X??O	Y* CY* ?YAY* Y* 'AY*  Y* ZY* )Z*Y* -
X<7Y* ?
Y	Y* 
Y'Y* "Y''Y* *
Z4ZZZ)<   )SostimerandomrR   r   heapq
jsonschemar   r   langchain_core.messagesr   requestsnumpynplangchain_community.embeddingsr    langchain_community.vectorstoresr   langchain_core.promptsr   crawl4air	   r   langchain.output_parsersr
   r   langchain_core.output_parsersr   r   langchain_core.runnablesr   $langchain_community.document_loadersr   r   r   r   PyPDF2r   $langchain_experimental.text_splitterr   pydanticr   r   typingr   r   r   urllib.parser   r   r   shutilpptxr   langchain_ollamar   r   marker.converters.pdfr    marker.modelsr!   marker.outputr"   pathlibr#   langchain_text_splittersr$   storigo_image_generatorr%   OLLAMA_MODELr'   r<   rC   rK   rO   rW   r_   rg   r   r   r6   r   r,  r1   r9   r:   <module>rN     s    	   	   0 -   ; 2 5 $  7 1 O 8 v v  @ % ' ' !  3   & ' . + ,  ? 7 :!U9 Ue eaY aa	 a
a9 a
*X! !2# 2DEr9   