
    h              	          S SK r S SKrS SKrS SKrS SKrS SKrS SKJrJr  S SK	J
r
  S SKrS SKrS SKJr  S SKJr  S SKJr  S SKJr  S SKrS SKJr  S S	KJr  S S
KJrJr  S SKJr  S SKJ r J!r!J"r"J#r#  S SK$J%r%  S SK&J'r'  S SK(J)r)J*r*  S SK+J,r,J-r-J.r.  S SK/J0r0  S SK+J1r1J.r.  S SKJ2r2  S SK3r3S SK4J5r5  S SKJ6r6  S SKJ7r7  S SK8J9r9  S SK:J;r;  S SK<J=r=  S SK>J?r?  S SK@JArA  S SKJr  S SKBJBrB  SrC " S S\)5      rD " S  S!\)5      rE " S" S#\)5      rF " S$ S%\)5      rG " S& S'\)5      rH " S( S)\5      rIS* rJS+ rKS, rLS- rMS. rNS/ rOS0 rPS1 rQS2 rRS3 rSS4 rTS5 rUS6 rVS SKWrWSQS7 jrXS8 rYSRS9 jrZSRS: jr[S S;K\J]r]  S< r^S= r_S> rOS? r`S@ raSSSA jrb " SB SC5      rcSD\d4SE jreSF rfSG rg\hSH:X  a&  SIriSJ\i 3rjSKrkSLrlSMrmSNrnSOroSNrpSPrqSPrr\g" \i\m\o\n\p\q\r5        gg)T    N)validateValidationError)	AIMessage)OllamaEmbeddings)FAISS)ChatPromptTemplate)AsyncWebCrawler)OutputFixingParser)PromptTemplate)StrOutputParserPydanticOutputParser)RunnablePassthrough)PyPDFLoader
TextLoaderDocx2txtLoaderUnstructuredPowerPointLoader)	PdfReader)SemanticChunker)	BaseModelField)ListOptionalDict)urlparse)Unionr   )RunnableLambda)Presentation)	OllamaLLM)
ChatOllama)PdfConverter)create_model_dict)text_from_rendered)Path)MarkdownHeaderTextSplitter)storigo_image_generatornomic-embed-textc                       \ rS rSr% \" S5      r\\S'   \" SSS9r\	\   \S'   \" SS	S9r
\\   \S
'   \" SSS9r\\S'   \" SSS9r\	\   \S'   Srg)SlideContent-   flashtypeNz$An optional subheading for the slidedescription
subheading.z(List of paragraphs for the slide content
paragraphszUA specific and concise suggestion for a relevant visualization or image (max 5 words)visualization_suggestionzURL of the image for the slideimage )__name__
__module____qualname____firstlineno__r   r+   str__annotations__r.   r   r/   r   r0   r1   __static_attributes__r2       O/var/www/eduai.edurigo.com/image_generation/testing/generate_storigo_content.pyr(   r(   -   sn    gD# %d8^ _J_!#3]^JS	^$)#  <S  %Tc  T 3STE8C=Tr:   r(   c                   ~    \ rS rSr% \" S5      r\\S'   \" SSS9r\\S'   \" SSS9r	\
\   \S	'   \" SS
S9r\\S'   Srg)
MCQContent6   Questionr+   .zThe multiple-choice questionr,   questionzA list of 4 answer optionsoptionsz0The correct answer (e.g., 'a', 'b', 'c', or 'd')correct_answerr2   N)r3   r4   r5   r6   r   r+   r7   r8   r@   rA   r   rB   r9   r2   r:   r;   r=   r=   6   sK    j!D#!#+IJHcJs0LMGT#YM1cdNCdr:   r=   c                   P    \ rS rSr% \" SSS9r\\\4   \	S'   \" SSS9r
\\	S'   Srg	)
StorigoContent<   .7Dictionary of slide contents with slide numbers as keysr,   slides/Total token count for all the generated contenttoken_countr2   N)r3   r4   r5   r6   r   rG   r   r7   r(   r8   rI   intr9   r2   r:   r;   rD   rD   <   s0    &+C=v&wFDl"#wS._`K`r:   rD   c                   r    \ rS rSr% \" SSS9r\\\4   \	S'   \" SSS9r
\\\4   \	S'   \" SSS9r\\	S	'   S
rg)StorigoContentMCQC   .rF   r,   rG   z8Dictionary of MCQs with identifiers like 'mcq_1' as keysmcqsrH   rI   r2   N)r3   r4   r5   r6   r   rG   r   r7   r(   r8   rN   r=   rI   rJ   r9   r2   r:   r;   rL   rL   C   sK    &+C=v&wFDl"#w"'9s"tD$sJ
tS._`K`r:   rL   c                   Z    \ rS rSr% \" SSS9r\\\\	\
4   4   \S'   \" SSS9r\\S'   Srg	)
StorigoContentMCQMidJ   .zYDictionary of slide contents with slide numbers as keys and MCQs with MCQ numbers as keysr,   rG   rH   rI   r2   N)r3   r4   r5   r6   r   rG   r   r7   r   r(   r=   r8   rI   rJ   r9   r2   r:   r;   rP   rP   J   sD    9>s  Ql  :mFDeL*4556  mS._`K`r:   rP   c                       \ rS rSrS rSrg)CustomMCQParserN   c                    [        U[        5      (       a  SR                  U5      nUR                  SS5      nUR                  SS5      R	                  5       n [
        R                  " U5      nU R                  R                  U5      $ ! [
        R                   a  n[        S[        U5       35      eS nAf[         a  n[        S[        U5       35      eS nAff = f)N '"zHere is the MCQ: zError decoding JSON: zError parsing result: )
isinstancelistjoinreplacestripjsonloadspydantic_objectmodel_validateJSONDecodeError	Exceptionr7   )selfresultjson_objectes       r;   parse_resultCustomMCQParser.parse_resultO   s    fd##XXf%F T*  2B7==?		?**V,K ''66{CC## 	>3CF8<== 	?4SVH=>>	?s$   0B CB66CCCr2   N)r3   r4   r5   r6   ri   r9   r2   r:   r;   rS   rS   N   s    ?r:   rS   c                      [        U 5      nSnUR                   H  nX#R                  5       -  nM     U$ ! [         a  n[        S[	        U5       35      eS nAff = f)NrY   z Error extracting text from PDF: )r   pagesextract_textrd   r7   )pdf_pathreadertextpagerh   s        r;   extract_text_from_pdfrr   h   sd    E8$LLD%%''D ! E:3q6(CDDEs   36 
A AAc                 F    [         R                  " SU 5      n[        U5      $ )Nz\w+)refindalllen)rp   tokenss     r;   count_tokensrx   s   s    ZZ%Fv;r:   c           	      r
  ^^3^4^5^6^7^8^9  [        SSSS9n[        U R                  R                  R	                  5       5      n	[        U	5      n
S(S jm6[        U64S jU	 5       5      n/ nX*::  a4  [        R                  " SU
S-
  U[        S	9nU Vs/ s H  oU   PM	     nnGOU	 Vs/ s H&  n[        UR                  R                  5       5      PM(     nn[        U5      nU Vs/ s H  nUU-  U-  PM     nnU Vs/ s H  n[        S[        U5      5      PM     nnU[        U5      -
  nSnUS:w  aJ  US:  a  UU==   S-  ss'   US-  nO!US:  a  UU   S:  a  UU==   S-  ss'   US-  nUS-   U
-  nUS:w  a  MJ  [        U	U5       H  u  nnUR                  U/U-  5        M     S
R!                  S US U  5       5      m4Sn[#        [$        S9n[&        R(                  " UUS9m9[*        R,                  " U5      nU44S jS U94S jS.U-  U-  T9-  nUR/                  SUS.5      nS[0        4S jm5[3        [5        UR6                  R9                  5       U54S jS95      m8[;        U[        5      (       a  US:H  nO+[;        U[0        5      (       a  UR=                  5       S;   nOSnU(       a  T8R9                  5        Hv  u  nn U R>                  (       aI  [@        RB                  " U R>                  U 5      n!U!(       a	  U!U l"        MH  [G        SU S35        S U l"        M`  [G        SU S35        S U l"        Mx     OT8R	                  5        H
  n S U l"        M     Sn"T8R	                  5        HH  n#U#RH                   SSR!                  U#RJ                  5       SU#R>                   3n$U"[M        U$5      -  n"MJ     U(       Ga1  Sm7S  n%SS K'm3U3U7UU84S! jn&T3RP                  " U&" 5       5      n'U'R	                  5        H3  n(U(S"    SSR!                  U(S#   5       SU(S$    3n$U"[M        U$5      -  n"M5     0 n)Sn*Un+Tn,US:X  ap  U,S:  a  U+U,-  OU+n-[S        [T        5       HD  u  n.nT8U   U)U'   U.S-   U--  S:X  d  M  U*U,:  d  M$  S%U*S-    3n/[G        U/5        U'U/   U)U/'   U*S-  n*MF     [W        U)U"S&9n0U0$ [T         H  nT8U   U)U'   M     [Y        U,5       H  n*S%U*S-    3n/U'U/   U)U/'   M     [W        U)U"S&9n0U0$ [%        T8U"S&9n1U1$ s  snf s  snf s  snf s  snf ! [Z         a  n2[[        S'[1        U25       35      eS n2A2ff = f))Nhttp://127.0.0.1:11434
gemma3:12bffffff?base_urlmodeltemperaturec           	      `    [        S[        [        U R                  5       5      U-  5      5      $ N   )maxroundrv   split)rp   words_per_slides     r;   estimate_slides/generate_slide_content.<locals>.estimate_slides   s$    q%DJJL 1O CDEEr:   c              3   H   >#    U  H  nT" UR                   5      v   M     g 7fNpage_content).0docr   s     r;   	<genexpr>)generate_slide_content.<locals>.<genexpr>   s     U*3_S-=-=>>*s   "r   r   )numdtypez

c              3   8   #    U  H  oR                   v   M     g 7fr   r   r   r   s     r;   r   r      s     "\?[#3#3?[s   u=  
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

STRICT RULES:
- ❗ ONLY use the content provided in the 'context' below.
- ❌ DO NOT introduce any external knowledge, definitions, or examples not present in the context.
- ⚠️ Do not assume common sense or use general facts. Stick to the exact information given.
- ⚠️ Avoid generic phrases like “as we know”, “in general”, or “in this video”.

Each slide must include:
- A clear and concise **sub-heading**
- **Exactly 2–4 concise paragraphs** derived solely from the context
- A **visualization suggestion** (max 5 words, specific to the content)

Important: Only output the final JSON object. No additional text, markdown, or explanation should be included.

Context:
{context}

{format_instructions}

The final output must be a valid JSON object where each slide is represented as "slide_1", "slide_2", ..., up to "slide_{num_slides}".
Each slide must contain:
- "subheading"
- "paragraphs"
- "visualization_suggestion"
ra   )parserllmc                    > T$ r   r2   )xcontext_texts    r;   <lambda>(generate_slide_content.<locals>.<lambda>   s    \r:   c                     U S   $ N
num_slidesr2   r   s    r;   r   r          ,r:   c                 $   > TR                  5       $ r   get_format_instructionsr   r   s    r;   r   r          1O1O1Qr:   )contextr   format_instructionsrY   queryr   item_keyc                     U R                  SS5      u  pUS:X  a  SnOUS:X  a  SnOSnUR                  5       (       a  [        U5      OSnX44$ N_r   slider   mcqi  r   isdigitrJ   r   prefixnum_strgroupnumbers        r;   custom_slide_sort_key5generate_slide_content.<locals>.custom_slide_sort_key   Q    &nnS!4OF 5%,__%6%6S\CF?"r:   c                    > T" U S   5      $ Nr   r2   kvr   s    r;   r   r      s    K`acdeafKgr:   key)1trueyes0Fz/Warning: No suitable image generated for slide .z/Warning: No visualization suggestion for slide rV   us  
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context
- Exactly **4 answer options**
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Dont give Here is the MCQ while generating MCQ
- Directly follow the format given below

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"

    Always give "question","options","correct_answer" these labels in double quotes only
}}

c                 f     [         R                  " U 5        g! [         R                   a     gf = f)NTF)r_   r`   rc   )responses    r;   is_valid_json-generate_slide_content.<locals>.is_valid_json,  s.    !JJx(++ ! !s    00c            
        >^^#    [         R                  " T5      m[        SSSS9m0 n [        TR	                  5       5      n/ n[        S[        U5      S5       H  n[        U 5      T:  a    O/ n[        U[        US-   [        U5      5      5       HC  nTX      nUR                  UR                   SSR                  UR                  5       35        ME     S	R                  U5      nUUU4S
 jn[        U5      T:  d  M  U" Xs5      n	UR                  U	5        M     TR                  " USS06I S h  vN n
U
 H8  n[        U[        5      (       d  M  U(       d  M#  S[        U 5      S-    3nXU'   M:     U $  ND7f)Nrz   r{   r|   r}   r      : rV   
c                   >^ #     TR                   " 5       R                  S U UU4S j5      I S h  vN n[        US5      (       a  UR                  nO[	        U5      nUR                  S5      nUS:w  ab  SnSn[        X4S  US9 H)  u  pxUS:X  a  US-  nM  US:X  d  M  US-  nUS:X  d  M'  Un  O   US:w  a  X4US-    n	[        R                  " U	5      n
Og g U	(       a  U	R                  5       (       d  g U
R                  S	S
5      U
R                  S/ 5      U
R                  SS
5      S.nUS	   (       a  [        US   5      S:X  a  US   (       a  U$ g  GN$! [         a
  n S nAg S nAff = f7f)Nc                  L   > [        U 4S j5      T-  T-  R                  0 5      $ )Nc                    > ST0$ )Nr   r2   )r   ctx_texts    r;   r   tgenerate_slide_content.<locals>.generate_mcqs_async.<locals>.generate_single_mcq.<locals>.<lambda>.<locals>.<lambda>P  s
    i=Rr:   )r   invoke)r   mcq_llm
mcq_prompts   r;   r   bgenerate_slide_content.<locals>.generate_mcqs_async.<locals>.generate_single_mcq.<locals>.<lambda>O  s,    $23R$S&0%1&-%. #)&*	)-r:   content{r   startr   }r@   rY   rA   rB   )r@   rA   rB      )get_event_looprun_in_executorhasattrr   r7   find	enumerater_   r`   r^   getrv   rd   )r   idx
mcq_resultr   r   open_bracesendidx_charcharcontent_onlyrg   formatted_mcqrh   asyncior   r   s   `            r;   generate_single_mcqPgenerate_slide_content.<locals>.generate_mcqs_async.<locals>.generate_single_mcqK  s{    3(/6/E/E/G/W/W $!-0 *J  'z9==*4*<*<*-j/$+LL$5E${./&(6?W\6]NH'+s{(3q(8)-(3q(8+6!+;2:C,1 7^ $'"93:Q3GL26**\2JK+/'+#/|7I7I7K7K'+ -8OOJ,K+6??9b+I2=//BRTV2W-M !.j 9 #M)$< = B -.> ?'4 4#'a*b  ) (#'(s^   E.*E EA*E 	E ,*E E.E 4E.5AE E.E 
E+!E.&E++E.return_exceptionsTmcq_r   )r   from_templater   r[   keysrangerv   minappendr.   r\   r/   gatherrZ   dict)rN   
slide_keystasksicontext_slidesjr   r   r   taskresultsrf   mcq_keyr   r   r   mcq_templatenum_mcqsordered_slidess                @@r;   generate_mcqs_async3generate_slide_content.<locals>.generate_mcqs_async5  sg    /==lK
$5& # !."5"5"78
q#j/15A4yH,%'N"1c!a%Z&AB .z} =&--1A1A0B"SXXeN^N^E_D`.ab C $(99^#<L4(l 5zH,2<CT*E 6H !( N NN%F!&$//FF$(TQ"8(.W &
  Os$   C:E<2E<5E:6E<E<E<r@   rA   rB   r   rG   rI    Error generating slide content: )2   ).r   r[   docstore_dictvaluesrv   sumnplinspacerJ   r   r   r   r   zipextendr\   r   rD   r
   from_llmr   r   r   r7   r   sortedrG   itemsrZ   r^   r0   r%   generate_slide_imager1   printr.   r/   rx   r   runr   r   rP   r   rd   ):vectors	client_idr   r  is_imageis_questionquestion_positionGPUr   
all_chunkstotal_chunkstotal_possibleselected_chunksindicesr   cword_countstotal_wordsw	raw_allocr   allocdiffchunkcountslide_content_template
raw_parserslide_content_promptslide_content_chainrf   is_image_bool	slide_keyslide_content
image_pathrI   r   text_contentr   r  rN   r   interleaved_contentmcq_countertotal_slides
total_mcqsintervalr   r   storigo_contentstorigo_content_without_mcrh   r   r   r   r   r  r  r   s:      `                                               @@@@@@@r;   generate_slide_contentr;  z   s   }E-
 '**00779:
:	F U*UU %kk!\A%5:SQG6=>g!}gO>O@JK
13q~~3356
KKk*K?JK{![:5{IK/89y!SE!H%yE9 E
*DA!)!8!HMHAIDAX%(Q,!HMHAIDUl* !) !$J 6u&&w7 !7 {{"\{PZ?["\\"6 *.I
#,,JCH1??@VW
 27'Q
 ##   	 %++b
,ST		#C 		# fV]]%8%8%:@ghih$$$MM#&&$NN,0IIM!M,:,@,@,B(	= 99!8!M!M%>>%"J ".8+ OPY{Z[\].2+KI;VWXY*.M' -C  "0!6!6!8&*# "9 #**,E#../q%:J:J1K0LAeNlNlMmnL<55K -  LB! V Vp ;;245D {{}"%j/!2!CHHS^4L3MQsScOdNef|L99 % #%K%L!J A%9Ca<:5\&/
&;NC5CI5N'	2a8+q0[:5M$(q(9":g7;G}+G4#q( '< #7>Q_j"k&& ",I5CI5N'	2 ", $)#4K $[1_$56G37='0 $5 #7>Q_j"k&&)7~[f)g&--G	 ?KK9@	  E:3q6(CDDEso   BT S<T &-TT $T6T <TA T =K4T 5T =/T -AT 0T <T 
T6T11T6c                 8    [        U 5      nUR                  5       $ zLoad PDF and return documents.)r   load	file_pathloaders     r;   load_pdfrB    s     #F;;=r:   c                 &    [        S U  5       5      $ )z1Count the total number of words in the documents.c              3   Z   #    U  H!  n[        US    R                  5       5      v   M#     g7fr   N)rv   r   r   s     r;   r   $count_total_words.<locals>.<genexpr>  s&     @4Cs3~&,,.//4   )+)r  )docss    r;   count_total_wordsrI    s    @4@@@r:   c                     [        USS9nUR                  U  Vs/ s H  o3R                  PM     sn5      n[        S5        U$ s  snf @Splits the text into semantic chunks using the given embeddings.
percentilebreakpoint_threshold_type%Documents split into semantic chunks.r   create_documentsr   r  rH  
embeddingstext_splitterr   	documentss        r;    split_text_with_semantic_chunkerrW    sK    #lM ..D/QDS0@0@D/QRI	
12 0R   Ac                 8  #    [        SS9 IS h  vN nUR                  U S9I S h  vN n[        UR                  5        [	        U 5      nUR
                  UR                  R                  SS5      -   nUR                  S5      (       d  US-  n[        US5       nUR                  UR                  5        S S S 5        [        S5        [        U5        UsS S S 5      IS h  vN   $  N N! , (       d  f       N;= f N! , IS h  vN  (       d  f       g = f7f)	NT)verbose)url/r   .txtr%  z	HELLO jII)r	   arunr  markdownr   netlocpathr]   endswithopenwrite)filecrawlerrf   
parsed_urlfilenametxt_files         r;   	crawlerrrrj    s     t,,|||-- 	foo d^
$$z'>'>sC'HH  ((H (C HNN6??+ !kh% -,,- !  -,,,sm   DC)DD C+A4D C-8D D#C>$D+D -
C;	7D >D DD	DDc                 Z    [        S5        U (       a$  [        R                  R                  U 5      (       d  [        SU  S35        g[	        U SSS9 nUR                  5       nS S S 5        W(       a  U$ S$ ! , (       d  f       N= f! [         a  n[        SU 35         S nAgS nAff = f)	NStartedzError: The file 'z*' does not exist or the path is incorrect.rY   rutf-8encodingzError reading the file: )r  osra  isfilerc  readrd   )inputre  r   rh   s       r;   read_file_urlru    s    iBGGNN511%eW,VWX %w/4iikG 0
 "w)r) 0/  (,-s<   AB B A7$B 5B 7
BB 
B*B%%B*c                     Sn[        SSS9n[        S/US9nX2-  nUR                  SU 05      n[        U5        UR                  n[        [        U5      5        U$ )Nak  
    Extract only the meaningful content from the text below. Focus on descriptions, value propositions, mission statements,
    features, and anything that provides valuable information about the company, products, or services. Ignore any URLs,
    navigation links, contact forms, or irrelevant sections.

    Here is the content to process:

    {context}
    rz   r{   r~   r   r   )input_variablestemplate)r   r   r   r  r   r+   )r   prompt_templater   promptrunnablefiltered_contents         r;   clean_using_llmr~    ss    O 'C YK/RF |H  	7';<	
'//	$
 !r:   c                    [        USS9n[        U [        5      (       a  U /n [        S[	        U 5       35        [        SU (       a  U S   OS 35        [        U S   [        5      (       a  U  Vs/ s H  nSU0PM	     n n[        S U  5       5      (       d  [        S	5        / $ UR                  U  Vs/ s H  o3S   PM	     sn5      n[        S
5        [        U5        U$ s  snf s  snf )rL  rM  rN  zType of docs after conversion: zFirst item in docs: r   z
Empty listr   c              3   Z   #    U  H!  n[        U[        5      =(       a    S U;   v   M#     g7frE  )rZ   r   r   s     r;   r   ;split_text_with_semantic_chunker_for_url.<locals>.<genexpr>.  s$     O$3z#t$>3)>>$rG  z"Error: Invalid document structure.rP  )r   rZ   r7   r  r+   allrR  rS  s        r;   (split_text_with_semantic_chunker_for_urlr    s    #lM
 $v 
+DJ<
89	 Dal C
DE $q'3156#%6 O$OOO23	 ..t/TtN0Ct/TUI	
12	) 7 0Us   0C 3C%c                    [         R                  R                  U5      (       d  [         R                  " U5        [	        U 5       Hj  u  p#SUS-    S3n[         R                  R                  X5      n[        USSS9 nUR                  UR                  5        SSS5        [        SU 35        Ml     g! , (       d  f       N!= f)	zBSaves each document in the documents list as a separate .txt file.document_part_r   r]  r%  rn  ro  NzSaved: )
rq  ra  existsmakedirsr   r\   rc  rd  r   r  )rV  
output_dirr   document	file_namer@  re  s          r;   save_documents_to_txtr  8  s    77>>*%%
J +$QqSE.	GGLL7	)S73tJJx,,- 4 		{#$ , 43s   >B55
C	c           	         [        U5      n[        R                  R                  SU5      n[        R                  " USS9  [        SS9n[        U SS9 H  u  pE[        R                  R                  U5      n[        R                  " USS9  [        R                  " U/US9n[        R                  R                  US	U 35      nUR                  U5        [        S
U SU SU 35        M     g Nmy_embeddingsTexist_okr&   r   r   r   	embeddingfaiss_indexz(Saved FAISS embedding for document part z as faiss_indexz in r7   rq  ra  r\   r  r   r   r   from_documents
save_localr  	split_documentsr  embedding_folder_baserT  r   r   embedding_foldertemp_dbembedding_file_paths	            r;   create_and_save_embeddingsr  G      II GGLL)D KK%5 "(:;J oQ777<<(=> 	$t4 &&u
C !ggll+;{3%=PQ./8_SEQUVfUghi 8r:   c                     [        U5      n[        SS9n[        R                  " XS9nSnUR	                  U5        [        SU 35        U$ )Nr&   r  r  faiss_supplier_indexu3   ✅ Created FAISS vectorstore in memory for client )r7   r   r   r  r  r  )r  r  rT  vectorstorefaiss_index_paths        r;   create_embeddingsr  e  sU    II "(:;J &&MK-+,	?	{
KLr:   c                 D   Uc	  [        SS9n[        R                  " USS9  / n[        U 5       H  u  pV[        R                  R                  USU 35      n[        R                  " USS9  [        R                  " U/U5      nUR                  U5        UR                  U5        [        [        R                  R                  US5      S5       n	[        R                  " Xi5        SSS5        M     [        S	[        U 5       S
U S35        U$ ! , (       d  f       M  = f)au  
Save each chunk in its own FAISS vector store directory.

Args:
    documents (List[Document]): List of LangChain Document objects.
    base_path (str): Base directory to store all FAISS chunks.
    embedding_model: Optional embedding model instance.
    api_key (str): Required if embedding_model is not passed.

Returns:
    List[str]: List of FAISS chunk folder paths.
Nr&   r  Tr  chunk_zdoc_metadata.pklwbu
   ✅ Saved z( chunks as individual FAISS indexes in 'rW   )r   rq  r  r   ra  r\   r   r  r  r   rc  pickledumpr  rv   )
rV  	base_pathembedding_modelapi_keychunk_pathsr   r   	chunk_dirvector_storefs
             r;   save_faiss_per_chunkr  u  s     *1CDKK	D)KI&GGLLfQCL9	
I- ++SE?C 		*9% "'',,y*<=tDKK ED ' 
Js9~&&NykYZ
[\	 EDs   D
D	c           	         [        U5      n[        R                  R                  SU5      n[        R                  " USS9  [        SS9n[        U SS9 H  u  pE[        R                  R                  U5      n[        R                  " USS9  [        R                  " U/US9n[        R                  R                  US	U 35      nUR                  U5        [        S
U SU SU 35        M     g r  r  r  s	            r;   create_and_save_embeddings_newr    r  r:   c                 6   [        SS9nS nU SU  3n[        R                  " U5       Vs/ s H@  nUR                  S5      (       d  M  U[	        S5      S  R                  5       (       d  M>  UPMB     nn[        US S9nU H]  n[        R                  R                  XE5      n[        SU 35        [        R                  " XSS	9n	Uc  U	nML  UR                  U	5        M_     Ub  UR                  U S
35        [        U5        U HG  n[        R                  R                  XE5      n [        R                  " U5        [        SU 35        MI     U$ s  snf ! [          a    [        SU 35         Mm  ["         a  n
[        SU SU
 35         S n
A
M  S n
A
ff = f)Nr&   r  r\  r  c                 8    [        U R                  SS5      5      $ Nr  rY   rJ   r]   r   s    r;   r   "merge_all_faiss1.<locals>.<lambda>  s    QYY}VX=Y9Zr:   r   Loading FAISS index from: Tallow_dangerous_deserialization/merged_faissDeleted FAISS index folder: Folder not found: Error deleting r   )r   rq  listdir
startswithrv   r   r  ra  r\   r  r   
load_local
merge_fromr  shutilrmtreeFileNotFoundErrorOSError)r  r  rT  merged_faissfolder_pathfolderfaiss_folderssorted_folders
faiss_pathcurrent_faissrh   s              r;   merge_all_faiss1r    s   &89JL Kq,K  ZZ446]+ 	06s=7I7J0K0S0S0U 	4   M/Z[N !WW\\+6
*:,78((aef (L ##M2 ! ;-} =>	, !WW\\+6
	7MM*%0=> ! KB ! 	5&zl34 	7OJ<r!566	7s/   E	E,E)$EF3	F<FFc                    [        SS9nS nU SU  3n[        R                  " U5       Vs/ s H@  nUR                  S5      (       d  M  U[	        S5      S  R                  5       (       d  M>  UPMB     nn[        US S9nU H  n[        R                  R                  XH5      n	[        SU	 35        [        R                  " XSS	9n
U
R                  R                  5        Vs/ s H(  nU
R                  R                  U5      R                   PM*     nnUc  U
nM  UR#                  U5        M     Ub  UR%                  U S
35        [        S5        U HG  n[        R                  R                  XH5      n	 [&        R(                  " U	5        [        SU	 35        MI     U$ s  snf s  snf ! [*         a    [        SU	 35         Mr  [,         a  n[        SU	 SU 35         S nAM  S nAff = f)Nr&   r  r\  r  c                 8    [        U R                  SS5      5      $ r  r  r   s    r;   r   !merge_all_faiss.<locals>.<lambda>  s    S=RT9U5Vr:   r   r  Tr  r  z(Merged FAISS index saved as merged_faissr  r  r  r   )r   rq  r  r  rv   r   r  ra  r\   r  r   r  index_to_docstore_idr  r	  searchr   	add_textsr  r  r  r  r  )r  r  rT  r  r  r  faiss_filessorted_filesre  r  r  doc_idcurrent_textsrh   s                 r;   merge_all_faissr    s   !(:;JLKq,KZZ446]+ 	06s=7I7J0K0S0S0U 	4  
 ++VWLWW\\+4
*:,78((aef (5'I'I'P'P'RT'RV '//66v>KK'R 	 T (L ""=1  ;-} =>8: WW\\+4
	7MM*%0=>	  KT& ! 	5&zl34 	7OJ<r!566	7s5   F 	F ,F "/F%6$F**G*	G*G%%G*)YouTubeTranscriptApic                 $   U R                  S5      S   n[        R                  " U5      n[        U5        SnU H  nUSUS   -   -  nM     [	        USSS9 nUR                  U5        S S S 5        [        S	U 35        U$ ! , (       d  f       N= f)
N=r   rY   rV   rp   r%  rn  ro  zTranscript saved to )r   r  get_transcriptr  rc  rd  )youtube_video_urlvideo_idtranscript_text
transcriptr   r  s         r;   
transcriber    s     &&s+A.H*99(CO	/JcAfIo%
  
hg	.!	
 
/ 
 

+,O	 
/	.s   B
Bc                 8    [        U 5      nUR                  5       $ r=  )r   r>  r?  s     r;   load_txtr  +  s     	"F;;=r:   c                     [        USS9nUR                  U  Vs/ s H  o3R                  PM     sn5      n[        S5        U$ s  snf rK  rQ  rS  s        r;   rW  rW  1  sM    #lM ..D/QDS0@0@D/QRI 

12	 0RrX  c                 T    [        [        5       S9nU" U 5      n[        U5      u  p4nU$ )N)artifact_dict)r    r!   r"   )rt  	converterrenderedrp   r   imagess         r;   parsingr  <  s-    +<+>?IH(2ODVKr:   c                 j    [        U 5      nUR                  U5      n[        S5        [        U5        U$ )NCCC)r$   
split_textr  )headers_to_split_onr   markdown_splittermd_header_splitss       r;   marks_splitterr  B  s5    23FG(33G<	%L	
r:   c                 \   / n[        U 5       Hv  u  pE[        US5      (       a  UR                  nOA[        US5      (       a  UR                  nO#[	        U[
        5      (       a  UnO[        U5      nUR                  XF45        Mx     U VVs/ s H  u  pG[        U5      U:  d  M  XG4PM     nnnU(       d  0 $ [        S U 5       5      n	0 n
UnU H2  u  p[        U5      U	-  n[        S[        X-  5      5      nXU'   X-  nM4     US:  a6  [        XR                  S9nX   S:  a  X==   S-  ss'   US-  nOOUS:  a  M6  US:  a]  [        US SS	9n[        [        U[        U5      5      5       H'  nUU   S   nU
R                  US5      S-   X'   US-  nM)     US:  a  M]  U
$ s  snnf )
aE  
Allocate slides based on chunk size (works on Document-like objects).

Args:
    chunks: List of Document (or string) objects
    total_slides: Total number of slides to generate
    min_chars: Minimum character count to consider a chunk valid

Returns:
    Dict mapping original chunk index to number of slides to generate
r   r   c              3   <   #    U  H  u  p[        U5      v   M     g 7fr   rv   )r   r   txts      r;   r   "allocate_slides.<locals>.<genexpr>h  s     3U61c#hhUs   r   r   r   c                     [        U S   5      $ r   r  r   s    r;   r   !allocate_slides.<locals>.<lambda>  s    S1Yr:   T)r   reverse)r   r   r   r   rZ   r7   r   rv   r  r   r   r   r  r   r   )chunksr6  	min_chars	extractedr   r)  rp   r  validtotal_charsallocations	remainingr   propcntmax_idxsorted_by_sizes                    r;   allocate_slidesr  I  s    If%5.))%%DUI&&==Ds##Du:D!# & %.GI&!SY1FXaXIEG	 3U33K KI3x+%!U4./0C		  a-k7!# A% NI a- a-+>Ms9c.&9:;A #A&C*sA6:KNI < a- G Hs   F(&F(c                   H    \ rS rSrSrS rS rS rS rS r	S r
S	 rS
 rSrg)SlideCollectioni  z2Container class for slides with dict-like behaviorc                     0 U l         g r   rG   re   s    r;   __init__SlideCollection.__init__  s	    r:   c                      X R                   U'   g r   r  )re   r   r   s      r;   	add_slideSlideCollection.add_slide  s    "Cr:   c                 6    U R                   R                  5       $ r   )rG   r   r  s    r;   r   SlideCollection.keys  s    {{!!r:   c                 ,    [        U R                  5      $ r   )iterrG   r  s    r;   __iter__SlideCollection.__iter__      DKK  r:   c                      U R                   U   $ r   r  )re   r   s     r;   __getitem__SlideCollection.__getitem__  s    {{3r:   c                 6    U R                   R                  5       $ r   )rG   r  r  s    r;   r  SlideCollection.items  s    {{  ""r:   c                 6    U R                   R                  5       $ r   )rG   r  r  s    r;   r  SlideCollection.values  s    {{!!##r:   c                 ,    [        U R                  5      $ r   )reprrG   r  s    r;   __repr__SlideCollection.__repr__  r  r:   r  N)r3   r4   r5   r6   __doc__r  r  r   r  r  r  r  r"  r9   r2   r:   r;   r	  r	    s-    <#"! #$!r:   r	  returnc                    [        U S5      (       a  U R                  nO[        U 5      nUR                  5       nSU;   d  SU;   a5  UR	                  S5      nUR                  S5      S-   nUS:w  a
  US:w  a  XU n[        R                  " S	S
U5      n [        R                  " U5      nSU;   a  [        R                  " US   5      $  U$ !    U$ = f)Nr   zHere's another attemptzI apologizer   r   r   r   r   z	'(\w+)\":z"\1":
properties)r   r   r7   r^   r   rfindrt   subr_   r`   dumps)
ai_messagerp   r   r   parseds        r;   quick_json_fixr-    s    z9%%!!:::<D  4'=D+@		#jjo!B;3!8c?D 66,$/DD!6!::f\233 "
 KKs   4C Cc                   ^0^1  Sn[        [        S9m1[        R                  " U5      n	[	        SSS9n
S S U14S jS	.U	-  U
-  T1-  n[        5       nS
n[        U5       H  nX   nX   n[        US[        US[        U5      5      5      nUR                  UUS.5      n[        US5      (       a  UR                  nO5[        US5      (       a  UR                  5       OUnUR                  S0 5      n[        U[        5      (       a  SU;   a  UR!                  S5        [        UR#                  5       S S9 H   nUR%                  SU 3UU   5        US
-  nM"     M     S[        4S jm0[        [        UR                  R'                  5       U04S jS95      nUUl        U(       Ga  UR                  R'                  5        H  u  nn[        U[        5      (       a  [)        S&0 UD6nOUnUR*                  (       a  [,        R.                  " UR*                  U5      nU(       a2  [        U[        5      (       a  UUR                  U   S'   M  UUl        M  [        U[        5      (       a  S UR                  U   S'   M  S Ul        M  [        U[        5      (       a  S UR                  U   S'   M  S Ul        M     O]UR                   HM  n[        UR                  U   [        5      (       a  S UR                  U   S'   M9  S UR                  U   l        MO     U(       Gac  Sn[        [2        S9n[        R                  " U5      n[5        [6        5      n[8        R:                  " U
US9n0 n[=        UR                  R#                  5       5      n [?        S[A        U 5      S5       GH  n![A        U5      U:  d  M  / n"[?        U![C        U!S-   [A        U 5      5      5       H  n#U U#   n$UR                  U$   n%[        U%[        5      (       a%  U%R                  SS5      n&U%R                  S/ 5      n'OU%RD                  n&U%RF                  n'U"RI                  U& SS RK                  U'5       35        M     S!RK                  U"5      n( UU
-  U-  U-  R                  U(URM                  5       S".5      n)U)US#[A        U5      S
-    3'   GM     0 n+Sn,US
:X  ae  US:  a  X#-  OUn-[Q        U 5       HI  u  n.nUR                  U   U+U'   U.S
-   U--  S:X  d  M&  U,U:  d  M.  S#U,S
-    3n/U/U;   a  UU/   U+U/'   U,S
-  n,MK     OEU  H  nUR                  U   U+U'   M     [?        U5       H  n,S#U,S
-    3n/U/U;   d  M  UU/   U+U/'   M     [S        U+SS$9$ [        UR                  SS$9$ ! [N         a  n* S n*A*GM  S n*A*ff = f! [N         a  n*[O        S%[        U*5       35      eS n*A*ff = f)'Na  
Based on the following context, generate professional and engaging content for exactly {num_slides} slides in a Storigos presentation.

Each slide must include:
- A clear and concise **sub-heading**
- **Paragraphs** that effectively communicate the key ideas and insights
- A specific, concise **visualization suggestion**

**Context**: {query}

Focus on creating content that is both informative and engaging. Ensure each slide:
- Has a well-structured sub-heading that captures the main point
- Uses clear and concise paragraphs to communicate important information

Use a professional and creative tone throughout. Each slide should incorporate the following elements where appropriate:
- **Thought-provoking questions** to encourage reflection
- **Relevant statistics** or data points that add credibility
- **Industry insights** or emerging trends to demonstrate expertise
- **Practical examples** or case studies to illustrate key concepts
- **Calls to action** to guide the audience toward specific actions or takeaways

For the visualization suggestion:
- Provide a clear and specific description of an image that would be relevant to the slide content.
- Keep it very concise, using a maximum of 5 words.
- Focus on concrete objects, scenes, or concepts that can be easily visualized.
- Avoid abstract or overly complex ideas.
- Include the context of the topic (e.g., "Python programming logo" instead of just "Python logo").

Make sure all content is drawn exclusively from the provided context or embedded data. Avoid introducing external information not found in the source material.

{format_instructions}

CRITICAL: The output must be a valid JSON object with this EXACT structure:
{{
  "slides": {{
    "slide_1": {{
      "type": "flash",
      "subheading": "...",
      "paragraphs": ["...", "..."],
      "visualization_suggestion": "...",
      "image": null
    }},
    "slide_2": {{ ... }}
  }},
  "token_count": 0
}}

DO NOT put "token_count" inside the "slides" object. It must be at the root level.
DO NOT include any explanations or additional text - only the JSON object.
The final output must be in strict sequential order: "slide_1", "slide_2", ..., up to "slide_{num_slides}".
r   rz   r{   rw  c                     U S   $ )Nr   r2   r   s    r;   r   /generate_slide_content_alloc1.<locals>.<lambda>  s    1W:r:   c                     U S   $ r   r2   r   s    r;   r   r0    r   r:   c                 $   > TR                  5       $ r   r   r   s    r;   r   r0    r   r:   )r   r   r   r   r   r   r   rG   
model_dumprI   c                 <    [        U R                  S5      S   5      $ )Nr   r   )rJ   r   )ks    r;   r   r0  "  s    c!''RU,WX/FZr:   r   slide_r   c                     U R                  SS5      u  pUS:X  a  SnOUS:X  a  SnOSnUR                  5       (       a  [        U5      OSnX44$ r   r   r   s        r;   r   <generate_slide_content_alloc1.<locals>.custom_slide_sort_key&  r   r:   c                    > T" U S   5      $ r   r2   r   s    r;   r   r0  1  s    OdeghiejOkr:   r1   u  
Based on the following context from the last two slides, generate one multiple-choice question (MCQ). The question should be relevant to the content and designed to test comprehension.

**Context**: {context}

The MCQ must include:
- A **question** related to the context
- Exactly **4 answer options**
- A clear indication of the **correct answer** as a single letter: 'a', 'b', 'c', or 'd'

⚠️ **Critical Requirements**:
- ✅ Return **only valid JSON** — no explanations, headers, or extra text.
- ✅ Ensure all fields and options are enclosed in **double quotes (`"`)**.
- ✅ Do **not** use letters like "A.", "B." in the options — just the plain text.
- Directly follow the format given below

The final output **must strictly follow** this format:
```json
{{
    "question": "<The MCQ question>",
    "options": [
        "<Option 1>",
        "<Option 2>",
        "<Option 3>",
        "<Option 4>"
    ],
    "correct_answer": "<Correct option (e.g., 'a', 'b', 'c', or 'd')>"
}}

{format_instructions}
)r   r   r   r   r.   rY   r/   r   rV   r   )r   r   r   r  r  r2   )*r   rD   r   r   r   r	  r  getattrr7   r   r   rG   r3  r   rZ   r   popr   r  r  r(   r0   r%   r  r1   r=   r   r-  r
   r  r[   r   rv   r   r.   r/   r   r\   r   rd   r   rP   )2r  r  r   r  r  r  r  r  r+  r-  r   r.  
all_slidescounter	chunk_idxnr)  r   rf   slide_itemsrawr0  r  r1  	slide_objr2  r  
mcq_parserr   
json_fixeroutput_fixing_parserrN   r   r   r   r   r   r   titleparasr   r   rh   r4  r5  r8  r   r   r   r   s2                                                   @@r;   generate_slide_content_alloc1rH    s   {E2"h &nE1??@VW-
 .7'Q
 ##   	 %&
,I&A%EE>75)SQVZ3XYE(//%q0QRF vx(($mm-4V\-J-Jf'')PV!ggh3 +t,,+1M. $K$4$4$6<Z[	$$vgY%7Y9OP1 \' -.		#C 		# fZ%6%6%<%<%>Dklm*
 ,6,=,=,C,C,E(	=mT22 , =} =I -I55!8!M!M!::!"J "%mT::DNJ--i8A2<M/%mT::DHJ--i8A26M/!-66@D
)))4W=.2+5 -F8 (..	j//	:DAA<@J%%i099=J%%i06	 / L> .jIJ+99,GJ'7J#5#>#>3z#Z Dj//4467J1c*oq1t9x'%'N"1c!a%Z&AB(m * 1 1# 6%eT22$)IIlB$?E$)IIlB$?E$)$4$4E$)$4$4E&--r#((5/9J.KL C $(99^#<L!&!"() 33 !&'33=3U3U3W"  # 8BtCIM?349 2B #%K A%5=\:1z&/
&;NC5?5F5Fy5Q'	2a8+q0[85K$(q(9":"d?;?=/8#q( '< ",I5?5F5Fy5Q'	2 ", $)?K $[1_$56G$7;G}+G4 $3
 (/BPQRR!):):JJ; % ! !>  E:3q6(CDDEs]   N3V2 ;CV2 ?VAV2 V2 AV2 /V2 V2 
V/#V2 *V//V2 2
W<WWc           
         [        U 5      n[        SSSS9 nUR                  U5        S S S 5        / SQn	[        SSSS9 nUR                  5       n
S S S 5        [	        U	W
5      n[        S5        [        XSS	9n[        S
[        U5       35        [        S[        U5       35        [        S5        [        XXX4XV5      n[        U5        g ! , (       d  f       N= f! , (       d  f       N= f)Nzparse_data.mdr%  rn  ro  ))#zHeader 1)z##zHeader 2)z###zHeader 3)z####zHeader 4rm  Chunksd   )r  zTotal chunks: zValid chunks: z
Slide allocation:)	r  rc  rd  rs  r  r  r  rv   rH  )rt  r   r  r  r  r  r  
parse_datar  r  r   r  
allocationr1  s                 r;   mainrO    s    J	osW	5	
 
6 
osW	5&&( 
6/9F	(O sCJ	N3v;-
()	N3z?+
,-	
 1&JYa  qB  GM	-' 
6	5 
6	5s   C C
C
C,__main__z-Chapter3-Basic-Requirement-in-the-Kitchen.pdfztemp/output_embeddingsi\B   T   r   )faiss_chunksNN)r  )rE   )srq  timerandomrt   r_   heapq
jsonschemar   r   langchain_core.messagesr   requestsnumpyr  langchain_ollamar    langchain_community.vectorstoresr   langchain_core.promptsr   crawl4air	   r   langchain.output_parsersr
   r   langchain_core.output_parsersr   r   langchain_core.runnablesr   $langchain_community.document_loadersr   r   r   r   PyPDF2r   $langchain_experimental.text_splitterr   pydanticr   r   typingr   r   r   urllib.parser   r   r   r  pptxr   r   r   marker.converters.pdfr    marker.modelsr!   marker.outputr"   pathlibr#   langchain_text_splittersr$   r%   OLLAMA_MODELr(   r=   rD   rL   rP   rS   rr   rx   r;  rB  rI  rW  rj  ru  r~  r  r  r  r  r  r  r  r  r  youtube_transcript_apir  r  r  r  r  r  r	  r7   r-  rH  rO  r3   rt  r  r  r  r   r  r  r  r  r  r2   r:   r;   <module>rq     s   	   	   0 -   - 2 5 $  7 1 O 8 v v  @ % ' ' !  3   & ' . + ,  ? 7 ;!U9 Ue eaY aa	 aa9 a?* ?4E~E@
A,.><%j< #Jj@.`*` 8	=D! !6# :|E|. z;EJ/IJHHK	
Cz(H[:KCP r:   