o
    AhJ{                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlZd dlmZ eG dd dZeG dd	 d	ZeG d
d dZG dd dZG dd dZdd Zedkrhe  dS dS )    N)	dataclassasdict)ListDictAnyOptionalTuple)Counter)Pathc                   @   s^   e Zd ZU dZeed< eed< ee ed< ee ed< ee ed< ee	e	f ed< eed< d	S )
SlideContentz&Structure for individual slide contentslide_numbertitlemain_pointssupporting_details
key_quotestimestamp_range
slide_typeN)
__name__
__module____qualname____doc__int__annotations__strr   r   float r   r   W/var/www/eduai.edurigo.com/doc_train/edurigo_ai/Puru/youtube_slides/content_analyzer.pyr      s   
 r   c                   @   s   e Zd ZU dZeed< eed< eed< ee ed< ee ed< eed< e	ed< eed	< e
eef ed
< eed< dZee ed< dS )AnalyzedContentz4Complete analyzed content ready for slide generationvideo_idvideo_title
main_topic
key_themesslidestotal_slidesestimated_presentation_timecontent_summaryanalysis_metadatasuccessNerror_message)r   r   r   r   r   r   r   r   r   r   r   r   boolr(   r   r   r   r   r   r      s   
 r   c                   @   sv   e Zd ZU dZeed< eed< eed< eed< eed< eed< dZ	eed	< dZ
eed
< dZeed< dZee ed< dS )SavedTranscriptz/Structure to match your saved transcript formatr   
transcriptdurationlanguage
word_countr'    r   extraction_methodextraction_timestampNr(   )r   r   r   r   r   r   r   r   r)   r   r0   r1   r(   r   r   r   r   r   r*   '   s   
 r*   c                   @   sj  e Zd ZdZd6dee dedefdd	Zd
edefddZd
ede	e fddZ
d7d
edede	e fddZd8d
ededefddZd9d
edede	e fddZdededefddZd e	e d!ede	e	e  fd"d#Zd$e	e d%ed&edefd'd(Zd:d$e	e d*ede	e fd+d,Zd$e	e d-e	e de	e fd.d/Zd;d$e	e d1ede	e fd2d3Zdefd4d5ZdS )<YouTubeContentAnalyzerzK
    Analyzes extracted content and structures it for slide generation
    N      target_slides
min_slides
max_slidesc                 C   s    || _ || _|| _h d| _d S )N>I   aianatbebydoheinisitmemyofonortoupusweandarebutcandidforhadhasherhimhisitsmayourshethewasyoubeendoesfromhaveintomustthatthemtheythisuponwerewillwithyouraboutaboveafteramongbeingbelowcouldmighttheirthesethosewouldbeforeduringshouldbetweendespitethroughtowards
throughout)r5   r6   r7   
stop_words)selfr5   r6   r7   r   r   r   __init__;   s   zYouTubeContentAnalyzer.__init__textreturnc                 C   s|   |sdS t dd|}t dd|}t dd|}t dd|}t jdd|t jd}t dd	|}t d
d	|}| }|S )zClean and normalize textr/   u   [♪♫🎵🎶]z\[.*?\]z\(.*?\)z<\d+:\d+:\d+\.\d+>zKind:\s*\w+\s*Language:\s*\w+flagsz\b\d+\b z\s+)resub
IGNORECASEstrip)r   r   r   r   r   
clean_textK   s   z!YouTubeContentAnalyzer.clean_textc                 C   s   |sg S |  |}td|}g }|D ]}| }t|dkr+| ds+|| qtdt| dt| d |rKtd|d d	d
  d |S )zExtract sentences from textz[.!?]+
   )zkind:z	language:zDebug: Extracted z sentences from  characterszDebug: First sentence: r   Nd   ...)	r   r   splitr   lenlower
startswithappendprint)r   r   r   	sentencesclean_sentencessentencer   r   r   extract_sentencesd   s   

z(YouTubeContentAnalyzer.extract_sentencesr   top_nc                    s|    | }td|} fdd|D }t|}dd ||d D }g }| }	tt|	d D ]a}
|	|
  d|	|
d   }t|dkr^|	|
  j	vr^|	|
d   j	vr^|
| |
t|	d k r|	|
  d|	|
d   d|	|
d   }t|d	kr|	|
  j	vr|	|
d   j	vr|
| q4t|}d
d ||D }tt|d|d  |d|d   }|d| S )z(Extract key phrases and topics from text\b[a-zA-Z]{3,}\bc                       g | ]	}| j vr|qS r   r   .0wordr   r   r   
<listcomp>       z>YouTubeContentAnalyzer.extract_key_phrases.<locals>.<listcomp>c                 S      g | ]\}}|qS r   r   )r   r   countr   r   r   r                r      r   c                 S   r   r   r   )r   phraser   r   r   r   r      r   N)r   r   r   findallr	   most_commonr   ranger   r   r   listset)r   r   r   r   wordsfiltered_words	word_freq	key_wordsphrases
words_listr9   r   phrase_freqtop_phraseskey_phrasesr   r   r   extract_key_phrases}   s0   
(
(z*YouTubeContentAnalyzer.extract_key_phrasesr/   r   c                    s   |rt d| } fdd|D }ng } j|dd}|r8d|dd  }|r4|d	|d
  7 }| S |rD|d
  }| S d}| S )z&Identify the main topic of the contentr   c                    r   r   r   r   r   r   r   r      r   z>YouTubeContentAnalyzer.identify_main_topic.<locals>.<listcomp>   r   r   Nr3    - r   zContent Analysis)r   r   r   r   joinr   )r   r   r   title_wordstitle_keywordsr   r    r   r   r   identify_main_topic   s   z*YouTubeContentAnalyzer.identify_main_topicr   
num_themesc           
      C   s   |  |}g }| j||d d}|d| D ]/}g }|D ]}| | v r9|t|dkr6|dd d n| q|rE| }	||	 q|sLg d}|d| S )z$Extract main themes from the contentr   r   Nr   r   )IntroductionzMain Contentz
Key Points
Conclusion)r   r   r   r   r   r   )
r   r   r   r   themesr   r   related_sentencesr   theme_titler   r   r   extract_key_themes   s    
&
z)YouTubeContentAnalyzer.extract_key_themescontent_lengthr,   c                 C   sh   | j rt| jt| j| j S tdtd|d }tdtd|d }t|| d }t| jt| j|S )z3Determine optimal number of slides based on contentr3   r4   -   i  r   )r5   maxr6   minr7   r   )r   r   r,   
time_based
word_basedoptimal_slidesr   r   r   determine_slide_count   s   z,YouTubeContentAnalyzer.determine_slide_countr   
num_slidesc                 C   s   |sg S t dt|| }g }g }t|D ]*\}}|| t||kr,|t|d k s4|t|d kr?|r?||  g }qt||krkt|dkrkt|dkr_|d |d  |  t||krkt|dksL|S )z'Segment content into slide-sized chunksr   r   )r   r   	enumerater   copyextendpop)r   r   r   sentences_per_slidesegmentscurrent_segmentr9   r   r   r   r   segment_content   s$   
,z&YouTubeContentAnalyzer.segment_contentcontent_segmentr   r#   c                 C   s   |sd| S |dkrdS ||krdS d |dd }| j|dd	}|r;|d
  }t|dkr9|dd d }|S d|d  S )z/Generate title for a slide based on its contentzSlide r   r   r   r   Nr   r3   r   r   2   /   r   z
Key Point )r   r   r   r   )r   r   r   r#   combined_textr   r   r   r   r   create_slide_title  s   
z)YouTubeContentAnalyzer.create_slide_title   
max_pointsc                 C   s   g }|d| D ]6}|  }tjdd|tjd}|r>t|dkr,|d  |dd  n| }|ds9|d7 }|| q|S )	z*Extract main points from a content segmentNz7^(and|but|so|then|also|furthermore|moreover|however)\s+r/   r   r   r   ).!?r   )r   r   r   r   r   upperendswithr   )r   r   r   pointsr   pointr   r   r   extract_main_points(  s   ,

z*YouTubeContentAnalyzer.extract_main_pointsr   c                 C   s   g }t |}|D ]9}||vrAt|dk rAtd|s2d| v s2d| v s2d| v s2d| v rA| }t|dkrA|| q|S )z6Extract supporting details that complement main pointsr3   z\d+examplezsuch as	includinglike   )r   r   r   searchr   r   r   )r   r   r   detailsused_sentencesr   detailr   r   r   extract_supporting_details@  s   
z1YouTubeContentAnalyzer.extract_supporting_detailsr   
max_quotesc                 C   st   g }|D ]3}t |dkr7t |dk r7d| v s%|ddks%td|r7| }|| t ||kr7 |S q|S )z*Extract memorable quotes or key statements      said"r   z[!.])r   r   r   r   r   r   r   )r   r   r   quotesr   quoter   r   r   extract_key_quotesU  s   

 z)YouTubeContentAnalyzer.extract_key_quotesc           "      C   s  z|j }|j}|j}|j}t|dd| }td|  tdt| d td|  td|dd	  d
 | |}tdt| d | |}|std|dd   |	d}	dd |	D }	|	r{tdt|	 d |	dd }nt
||dg g dddi ddt| dt| dW S tdt| d | ||}
| |}| ||}| ||}tdt| d| d g }t|D ]U\}}|d }|dkrd}n|t|krd }nd!}| ||t|}| |}| ||}| |}|t| }|d | }|| }t|||||||f|d"}|| qd#t| d$t| d%|
  d}t|d& }td'd( |D d) }|| }|t|t||r]td*d( |D t| nd|rg|t| ndtd+j  d,}t
|||
||t||||d-d.
W S  ty }  z4ddl}!td/t|   td0|!   t
t|d1d2d3dg g dddi dd4t|  dW  Y d} ~ S d} ~ ww )5zLMain analysis function that processes extracted content into slide structurer   zVideo Analysis - zDebug: Starting analysis for zDebug: Transcript length: r   zDebug: Video title: zDebug: Transcript preview: N   r   z Debug: Clean transcript length: z6Debug: No sentences extracted. Raw transcript sample: i  r   c                 S   s$   g | ]}t | d kr| qS r   )r   r   )r   sr   r   r   r     s   $ z:YouTubeContentAnalyzer.analyze_content.<locals>.<listcomp>z$Debug: Alternative extraction found z
 sentencesr   r/   r   Fz*No content to analyze. Transcript length: z, Clean length: r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   zDebug: Processing zDebug: Created z content segments for z slidesr   intro
conclusioncontentr   r   r   r   r   r   r   zAnalysis of z sentences across z slides, focusing on g      ?c                 s   s$    | ]}t |jt |j V  qd S N)r   r   r   r   slider   r   r   	<genexpr>  s   " z9YouTubeContentAnalyzer.analyze_content.<locals>.<genexpr>g      ?c                 s   s    | ]}t |jV  qd S r  )r   r   r  r   r   r   r    s    datetime)original_word_countsentences_analyzedslides_createdavg_points_per_slidecontent_densityanalysis_timestampT)
r   r   r    r!   r"   r#   r$   r%   r&   r'   zDebug: Exception occurred: zDebug: Traceback: r   unknownErrorzAnalysis failed: ) r   r+   r,   r.   getattrr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   sum
__import__r	  now	isoformat	Exception	tracebackr   
format_exc)"r   extracted_contentr   r+   r,   r.   r   clean_transcriptr   alt_sentencesr    r!   r   content_segmentsr"   r9   segmentr   r   r   r   r   r   segment_duration
start_timeend_timer  r%   	base_timedetail_timeestimated_timer&   er  r   r   r   analyze_contenti  s   






$"	
z&YouTubeContentAnalyzer.analyze_content)Nr3   r4   r   )r/   )r   )r   )r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r&  r   r   r   r   r2   6   s    ." " r2   c                
   @   s   e Zd ZdZddedefddZded	ee fd
dZd	e	e
eef  fddZddedee ded	ee fddZded	efddZded	ee fddZded	ee fddZdefddZdS )TranscriptAnalysisManagerz%Manages analysis of saved transcriptstranscriptsanalysistranscripts_diranalysis_dirc                 C   s.   t || _t || _| jjdd t | _d S )NT)exist_ok)r
   r*  r+  mkdirr2   analyzer)r   r*  r+  r   r   r   r     s   

z"TranscriptAnalysisManager.__init__r   r   c              
   C   s   zG| j | d }| std|  W dS t|ddd}t|}W d   n1 s.w   Y  td
i |}td| d|j  |W S  tyd } ztd	| d|  W Y d}~dS d}~ww )z Load a specific saved transcriptz_transcript.jsonu   ❌ Transcript file not found: Nrutf-8encodingu   ✅ Loaded transcript for : u!   ❌ Error loading transcript for r   )	r*  existsr   openjsonloadr*   r   r  )r   r   transcript_filefdatar+   r%  r   r   r   load_transcript  s    z)TranscriptAnalysisManager.load_transcriptc                 C   s   g }| j dD ]c}zEt|ddd}t|}W d   n1 s"w   Y  ||dd|dd	|d
d|dd|dd|ddt|d W q tyk } zt	d| d|  W Y d}~qd}~ww t
|dd ddS )z$List all available saved transcriptsz*_transcript.jsonr/  r0  r1  Nr   r  r   Unknownr.   r   r,   r'   Fr0   )r   r   r.   r,   r'   r0   	file_pathzError reading r3  c                 S   s   |  ddS )Nr.   r   )get)xr   r   r   <lambda>3  s    zFTranscriptAnalysisManager.list_available_transcripts.<locals>.<lambda>T)keyreverse)r*  globr5  r6  r7  r   r>  r   r  r   sorted)r   r(  r=  r9  r:  r%  r   r   r   list_available_transcripts  s(   






 z4TranscriptAnalysisManager.list_available_transcriptsNFr5   force_refreshc                 C   s   | j | d }|s$| r$ztd|  | |W S    td Y | |}|r.|js4td dS td| d t|d}||}|jrW| | td	|  |S td
|j	  dS )z#Analyze a specific saved transcript_analysis.jsonu#   📁 Loading existing analysis for u9   ⚠️ Failed to load existing analysis, creating new oneu6   ❌ Cannot analyze: transcript not available or failedNu   🔄 Analyzing transcript for r   r5   u   ✅ Analysis completed for u   ❌ Analysis failed: )
r+  r4  r   load_analysisr;  r'   r2   r&  save_analysisr(   )r   r   r5   rF  analysis_filer+   r.  analyzed_contentr   r   r   analyze_transcript5  s(   





z,TranscriptAnalysisManager.analyze_transcriptrL  c              
   C   s   z7| j |j d }t|}t|ddd}tj||ddd W d   n1 s)w   Y  td	|  W d
S  tyQ } ztd|  W Y d}~dS d}~ww )zSave analysis results to JSONrG  wr0  r1  r   F)indentensure_asciiNu   💾 Analysis saved to: Tu   ❌ Failed to save analysis: )r+  r   r   r5  r6  dumpr   r  )r   rL  rK  analysis_dictr9  r%  r   r   r   rJ  X  s   z'TranscriptAnalysisManager.save_analysisc           
      C   s0  z}| j | d }| sW dS t|ddd}t|}W d   n1 s'w   Y  g }|dg D ]"}t|d |d |d	 |d
 |d t|d |d d}|| q4t	|d |d |d |d ||d |d |d |d |d |dd}|W S  t
y }	 ztd|	  W Y d}	~	dS d}	~	ww )zLoad previously saved analysisrG  Nr/  r0  r1  r"   r   r   r   r   r   r   r   r  r   r   r    r!   r#   r$   r%   r&   r'   r(   r   u   ❌ Error loading analysis: )r+  r4  r5  r6  r7  r>  r   tupler   r   r  r   )
r   r   rK  r9  r:  r"   
slide_datar  rL  r%  r   r   r   rI  j  sL   
	z'TranscriptAnalysisManager.load_analysisc                 C   s   |  |}|r
|jsdS d|j d|j d|j dd|jdd  d|jd	d
}|jD ]}|d|j	 d|j
 dt|j d7 }q.|S )z+Get a formatted summary of analysis resultsNu   
📊 **ANALYSIS SUMMARY: u'   **

🎯 **Overview:**
• Main Topic: u   
• Total Slides: u   
• Key Themes: , r3   u   
• Estimated Time: .1fu#    minutes

📋 **Slide Structure:**z
   . z (z points))rI  r'   r   r    r#   r   r!   r$   r"   r   r   r   r   )r   r   analyzedsummaryr  r   r   r   get_analysis_summary  s"   



(z.TranscriptAnalysisManager.get_analysis_summaryc           	      C   s  |  |}|r
|jstd|  dS td|j d td td|j  td|j  td|jd	d
 tdd|j  td td |j	D ]x}td|j
 d|j d|j d |jrwtd |jD ]	}td|  qm|jrtd |jD ]	}td|  q|jrtd |jD ]
}td| d qt|jd }t|jd }td|d  d|d dd |d  d|d d qPdS )!z.Print detailed analysis with all slide contentu   ❌ No analysis available for Nu   
🎬 **DETAILED ANALYSIS: z**z<============================================================u   🎯 Main Topic: u   📊 Total Slides: u   ⏱️ Estimated Time: rV  z minutesu   🔑 Key Themes: rU  u   
📋 **SLIDE BREAKDOWN:**z(----------------------------------------u   
🎯 **Slide r3  z** ()u      📌 Main Points:u
         • u      📝 Supporting Details:z      - u      💬 Key Quotes:z      "r   r   r   u      ⏰ Timestamp: <   :02dr   )rI  r'   r   r   r    r#   r$   r   r!   r"   r   r   r   r   r   r   r   r   )	r   r   rX  r  r   r   r   timestamp_starttimestamp_endr   r   r   print_detailed_analysis  s<   


"


6z1TranscriptAnalysisManager.print_detailed_analysis)r(  r)  )NF)r   r   r   r   r   r   r   r*   r;  r   r   r   rE  r   r)   r   rM  rJ  rI  rZ  ra  r   r   r   r   r'    s    $#-r'  c            	   
   C   s*  t  } td td |  }t|dD ]4\}}|d rdnd}t| d| d|d	  d
|d   td|d dd|d dd|d   q|sQtd dS dd |D }|r|d d	 }td|  | j|dd}|rtd| |  td| d}| dkr| | dS dS dS dS ) z"Main function to demonstrate usageu   📁 Available Transcripts:z(========================================r   r'   u   ✅u   ❌rW  r   r   r3  r   z
   Words: r.   ,z | Duration: r,   z.0fzs | Method: r0   uJ   ❌ No transcripts found. Make sure you have saved some transcripts first.Nc                 S   s   g | ]}|d  r|qS )r'   r   )r   tr   r   r   r     s    zmain.<locals>.<listcomp>r   u$   
🔄 Analyzing example transcript:    rH  
z
Show detailed analysis for z	? (y/N): y)	r'  r   rE  r   rM  rZ  inputr   ra  )	managerr(  r9   r+   statussuccessful_transcriptsexample_video_idrX  choicer   r   r   main  s0   &,rm  __main__)r   r6  osdataclassesr   r   typingr   r   r   r   r   collectionsr	   mathpathlibr
   r   r   r*   r2   r'  rm  r   r   r   r   r   <module>   s0       I ]'
