o
    ݈h                  
   @   s,  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
mZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ ejjddd\Z Z!e!\Z"Z#Z$Z%Z%dd Z&dd Z'dd Z(dd Z)dd Z*dd Z+dd Z,dd Z-dd Z.eG d d! d!Z/eG d"d# d#Z0eG d$d% d%Z1eG d&d' d'Z2d(d) Z3d*d+ Z4d,ee5 fd-d.Z6d/ee
e5e7f  d0e8d1e7d2e7fd3d4Z9d5e7d6e7d7e7d1e7d0e8f
d8d9Z:d:e
e5e	f d;e8d0e8fd<d=Z;d>ej<d:e
e5e	f d?e8fd@dAZ=dBdBi fdCeeeB  dDe5dEe>fdFdGZ?dS )H    N)	dataclassfield)Path)AnyDictListUnion)IndicNormalizerFactory)EncDecCTCModel)EncDecHybridRNNTCTCModel)logging)tqdm)BLANK_TOKENSPACE_TOKENV_NEGATIVE_NUMzsnakers4/silero-vad
silero_vad)repo_or_dirmodelc                 C   s4   t | j| d  }t d|j}|dd}|S )N_ -)r   partsjoinstemreplace)audio_filepathaudio_filepath_parts_in_utt_idfp_partsutt_id r   X/var/www/eduai.edurigo.com/doc_train/edurigo_ai/Puru/tts/BhasaAnuvaad/utils/data_prep.py_get_utt_id#   s   r!   c                 C   s|   t | d}tdd |D }W d   n1 sw   Y  dd td||D }dd |D }|d || ||fS )	zN
    Get the start and end ids of the lines we will use for each 'batch'.
    rc                 s   s    | ]}d V  qdS )   Nr   ).0r   r   r   r    	<genexpr>0   s    z(get_batch_starts_ends.<locals>.<genexpr>Nc                 S      g | ]}|qS r   r   r$   xr   r   r    
<listcomp>2       z)get_batch_starts_ends.<locals>.<listcomp>r   c                 S   s   g | ]}|d  qS )r#   r   r'   r   r   r    r)   3       )opensumrangepopappend)manifest_filepath
batch_sizefnum_lines_in_manifeststartsendsr   r   r    get_batch_starts_ends*   s   

r7   c                 C   sT   d}t | d}|D ]}t|}||v rd}q
W d   |S 1 s#w   Y  |S )zV
    Returns True if entry is a key in any of the JSON lines in manifest_filepath
    Fr"   TNr,   jsonloads)r1   entryentry_in_manifestr3   linedatar   r   r    is_entry_in_any_lines:   s   

r?   c                 C   s^   t | d }|D ]}t|}||vr W d   dS qW d   dS 1 s(w   Y  dS )zW
    Returns True is entry is a key in all of the JSON lines in manifest_filepath.
    r"   NFTr8   )r1   r;   r3   r=   r>   r   r   r    is_entry_in_all_linesK   s   

r@   c              
   C   sT   t  }z||}W n   |  Y S | tddtjddd } || }|S )z
    Perform NFC -> NFD normalization for a sentence and a given language
    sentence: string
    lang_code: language code in ISO format
     |u   ।۔'-)	r	   get_normalizer	translatestr	maketransstringpunctuationr   	normalize)sentence	lang_codefactory
normalizernormalized_sentencer   r   r    normalize_sentenceY   s   
rO   c           	      C   s   g }t | ddd^}t|D ]H\}}||krP||krPt|}d|v rK|d dd|d< d|d  |d< |d dd	|d< t|d ||d< || ||krV n	qW d    |S W d    |S 1 sjw   Y  |S )
Nr"   z	utf-8-sig)encodingtextu   ﻿rA   r   u   …z...)	r,   	enumerater9   r:   r   r   splitrO   r0   )	r1   startendlang_idmanifest_lines_batchr3   line_ir=   r>   r   r   r    get_manifest_lines_batchk   s,   



rY   c                 C   sF   g }| D ]}||j jv r||j j| q|t|j j q|S )N)decoder
vocabularyr0   indexlen)rQ   r   tokens	characterr   r   r    get_char_tokens   s   
r`   c                 C   s   i ddddddddd	d
dddddddddddddddddddd
dddddddd}||v rG|| | krGdS dS )zFreturns True if ref_text is a subscript or superscript version of textu   ⁰0   ¹1   ²2   ³3u   ⁴4u   ⁵5u   ⁶6u   ⁷7u   ⁸8u   ⁹9u   ₀u   ₁u   ₂u   ₃u   ₄u   ₅u   ₆)u   ₇u   ₈u   ₉TFr   )ref_textrQ   sub_or_superscript_to_numr   r   r    is_sub_or_superscript_pair   sV   	
rp   c                 C   sJ   d| v r|  dd} d| v sd| v r|  dd} d| v sdd |D }|S )Nu   ▁▁u   ▁__r   c                 S   r&   r   r   )r$   tokenr   r   r    r)      r*   z&restore_token_case.<locals>.<listcomp>)r   )wordword_tokensword_tokens_casedr   r   r    restore_token_case   s   #rv   c                   @   sV   e Zd ZU dZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed< dS )TokenNrQ   
text_caseds_starts_endt_startt_end)__name__
__module____qualname__rQ   rE   __annotations__rx   ry   intrz   r{   floatr|   r   r   r   r    rw      s   
 rw   c                   @   s`   e Zd ZU dZeed< dZeed< dZeed< dZ	e
ed< dZe
ed< eedZee ed< dS )	WordNrQ   ry   rz   r{   r|   default_factoryr^   )r}   r~   r   rQ   rE   r   ry   r   rz   r{   r   r|   r   listr^   r   rw   r   r   r   r    r      s   
 r   c                   @   sl   e Zd ZU dZeed< dZdZeed< dZ	eed< dZ
eed< dZeed< eedZeeeef  ed< dS )	SegmentNrQ   ry   rz   r{   r|   r   words_and_tokens)r}   r~   r   rQ   rE   r   	pred_textry   r   rz   r{   r   r|   r   r   r   r   r   r   rw   r   r   r   r    r      s   
 "r   c                   @   sx   e Zd ZU eedZee ed< eedZ	ee
eef  ed< dZeed< dZeed< dZeed< eedZeed< dS )		Utterancer   token_ids_with_blankssegments_and_tokensNrQ   r   r   saved_output_files)r}   r~   r   r   r   r   r   r   r   r   r   r   rw   rQ   rE   r   r   dictr   r   r   r   r    r      s   
 r   c                 C   s  |s| g}n|  |}dd |D }dd |D }t| ||d}t|drzt|dr/|j}	nt|jj}	|	g|_t| dkrA|S |j| }
d}t	dt|
D ]}|
| |
|d  kr`|d7 }qPt|
| |krtt
d	| d
 |S |jtttddd d}d}|D ]}|j|}|jt|||t|d  d d |t|d 7 }| d}t|D ]\}}|j|}|j|}t||}|jd jt|||t|d  d d |t|d 7 }tt|||D ]P\}\}}}|j||	g |jd jd jt||t|jd t|jd d |t|d k r=|jd jd jtttt|jd t|jd d q|t|d k r`|jd jtttt|jd t|jd d q|jtttt|jd t|jd d q|S t|jdr\t|jj}	|jjd}|	g|_t| dkr|S t| |}
d}t	dt|
D ]}|
| |
|d  kr|d7 }qt|
| |krt
d	| d
 |S |jtttddd d}d}t|D ]t\}}t||}|jt|||t|d  d d |t|d d 7 }| d}t|D ]\}}t|}t||}|jd jt|||t|d  d d |t|d d 7 }tt||D ]V\}\}}|j|g |jd jd jt||t|jd t|jd d |t|d k r|j|	g |jd jd jtttt|jd t|jd d qH|t|d k r|j|	||	g |jd jtttt|jd t|jd d |jd jtttt|jd t|jd d |jd jtttt|jd t|jd d q|j|	g |jtttt|jd t|jd d |t|d k rX|j||	g |jtttt|jd t|jd d |jtttt|jd t|jd d q|S td)at  
    Function to create an Utterance object and add all necessary information to it except
        for timings of the segments / words / tokens according to the alignment - that will
        be done later in a different function, after the alignment is done.

        The Utterance object has a list segments_and_tokens which contains Segment objects and
        Token objects (for blank tokens in between segments).
        Within the Segment objects, there is a list words_and_tokens which contains Word objects and
        Token objects (for blank tokens in between words).
        Within the Word objects, there is a list tokens tokens which contains Token objects for
        blank and non-blank tokens.
        We will be building up these lists in this function. This data structure will then be useful for
        generating the various output files that we wish to save.
    c                 S   s   g | ]}|  qS r   )stripr$   segr   r   r    r)   (  r+   zget_utt_obj.<locals>.<listcomp>c                 S   s   g | ]
}t |d kr|qS )r   )r]   r   r   r   r    r)   *  s    )rQ   r   r   	tokenizerblank_idr   r#   z
Utterance zv has too many tokens compared to the audio file duration. Will not generate output alignment files for this utterance.)rQ   rx   ry   rz      )rQ   ry   rz   r   r[      z Cannot get tokens of this model.) rS   r   hasattrr   r]   r   vocabr   text_to_idsr.   r   infor   r0   rw   r   text_to_tokensr   rR   rv   r   r   zipextendr^   rZ   r[   r\   r`   r   r   RuntimeError)rQ   r   language_id	separatorTr   r   segmentsuttBLANK_ID
all_tokensn_token_repetitionsi_toksegment_s_pointerword_s_pointersegmentsegment_tokenswordsword_irs   rt   word_token_idsru   token_irr   token_idtoken_casedSPACE_ID	i_segmenti_wordr   r   r    get_utt_obj  s  


	





	

r   c                 C   s  t  }t  }d}t|D ]\}}||kr"|||< |dkr"|d ||< |}qt|d ||< | jD ]}t|tu r|}	||	j | |	_||	j d | |	_	|	j
D ]m}
t|
tu r|
}||j | |_||j d | |_	|jD ]&}|j|v r~||j | |_nd|_|j|v r||j d | |_	qnd|_	qnqO|
}|j|v r||j | |_nd|_|j|v r||j d | |_	qOd|_	qOq0|}|j|v r||j | |_nd|_|j|v r||j d | |_	q0d|_	q0| S )a|  
    Function to add t_start and t_end (representing time in seconds) to the Utterance object utt_obj.
    Args:
        utt_obj: Utterance object to which we will add t_start and t_end for its
            constituent segments/words/tokens.
        alignment_utt: a list of ints indicating which token does the alignment pass through at each
            timestep (will take the form [0, 0, 1, 1, ..., <num of tokens including blanks in uterance>]).
        output_timestep_duration: a float indicating the duration of a single output timestep from
            the ASR Model.

    Returns:
        utt_obj: updated Utterance object.
    r   r   r#   )r   rR   r]   r   typer   ry   r{   rz   r|   r   r   r^   )utt_objalignment_uttoutput_timestep_duration!num_to_first_alignment_appearance num_to_last_alignment_appearanceprev_stssegment_or_tokenr   word_or_tokenrs   rr   r   r   r    add_t_start_end_to_utt_obj  s   






-

r   audio_filepaths_batchc                 C   s~  d}t  }| D ]}t|}t|t}t|dkr$|dt|d d tdt|d D ].}||d  d d || d< t|| d | d|| d< t|| d | d|| d	< q-t|d |d
 d< t|d
 d | d|d
 d< t|d
 d | d|d
 d	< t||dd}t	
 }g }	t|D ]\}}
|jd| d }t|||
d |
d   |	| q|	|d||< q|S )Ni>  r   r#   )rT   rU   rT   rU   r   
start_secsend_secsr         /z.wav)chunk_pathsspeech_timestamps)r   
read_audioget_speech_timestampsr   r]   r0   r.   roundadjust_timestampstempfileTemporaryDirectoryrR   name
save_audio)r   sample_rate	tmp_paths	file_pathwavr   iadjusted_timestamps
tmpdirnamecurrent_chunk_pathsspeech_dictpathr   r   r    
get_chunks  sH   
r   r   r   max_chunk_duration_smin_chunk_duration_sc                 C   s  | sg S d| d d< | D ]}t |d | d|d< t |d | d|d< qg }| d d }| d d }| d d }tdt| D ]0}	|| }
|
|k rQ| |	 d }q@t|||
||}|| | |	 d }| |	 d }| |	 d }q@|| }
t|||
||}|| |S )z
    Takes the speech timestamps output by the vad model and further
    splits/merges based on the max_chunk_duration_s/min_chunk_duration_s.

    Returns a list of adjusted timestamps.
    r   rT   r   r   rU   r   r#   )r   r.   r]   windowed_chunkingr   )r   r   r   r   r   r   
curr_startcurr_start_secscurr_end_secsr   chunk_durationchunked_timestampsr   r   r    r   6  sH   

r   r   r   r   c           
      C   s   g }| }|}|}|dkrA|| dkr|n|}	||	8 }| t||t||	 | ||	 d |d d d }t|| d}|dks|S )a`  
    Checks if the  chunk duration is greater than the max_chunk_duration_s.
    If it is greater, divide into chunks of max_chunk_duration_s till chunk
    duration is fully split and return split timestamps.

    For example, if chunk duration is 10 seconds and max_chunk_duration_s is 3 seconds,
    the split timestamps will be 3s, 3s, 3s, 1s.
    r   )rT   r   rU   r   r   rU   r#   r   )r0   r   r   )
r   r   r   r   r   r   rT   r   remaining_chunk_duration
chunk_sizer   r   r    r   w  s*   
r   speech_timestamp	timestepsc                 C   s&   | d | d  | }t || d}|S )NrU   rT   r   )r   )r   r   r   r   r   r   r   r    get_output_timestep_duration  s   r   outputoutput_timestep_duration_logitsc                 C   s   t |d |d  | }| jd |kr| d |d d f S || jd  }t|| jd fj| jd}d|d d df< tj| |fddS )NrU   rT   r   r#   )devicer   )axis)r   shapetorchzerostor   cat)r   r   r   required_timestepsblanks_timestepsblanksr   r   r    resize_model_output  s   r   Fr   r   use_silero_vadc           6      C   st  dd | D }t |}g }g }g }|
s|	s|rt|}t  g }d}| D ]\}}|j|d d||d}t|tkrHt |dkrH|d }|s]t|d d |d j	j
d |jjj}||jjj }t|d j	|d d ||d _	||d  td	t |D ]0}t|| j	|d | ||| _	t|d
 j	|| j	f|d
 _	|d
  jd|| j 7  _q|  q)W d   n1 sw   Y  n6|j|d||d}t|tkrt |dkr|d }nt  |j|d|d}W d   n1 sw   Y  |D ]}||j	 ||j	j
d  ||j q n=|d }|d }|d }t|ddD ]*}|  |||| |j||dd\}}|| ||j
d  || q,g }g } g }!t| D ]|\}"}#|rrd||"  }$n|#d dd}$t|$|||||" ||" t||" |}%|r||" |%_t |%jdkrt d|%j! d d|#v r|#d |%_n|#d |%_t |%jdkrt d|%j! d ||%j" | t |%j" |!|% qat#|}&t#| }'t$|drt |j%j&d	 }(nt |j'j(d	 }(t)|})t)| }*t*t+||&|(f }+t|D ]\},}-|-j
d }.|-|+|,d|.ddf< q|(tj+||'ftj,d }/t|D ]\},}0|*|, }1t)|0|/|,d|1f< q;|du rd|jjvr`t-dd|jjvrkt-dt./|d }2|2j0|2j1 }3W d   n	1 sw   Y  |3|jjj2 }4t3|4t4|)d  }5|jj5j6|5 |jjj }t d |5 d!| d" |+|/|)|*|!|fS )#az  
    Returns:
        log_probs, y, T, U (y and U are s.t. every other token is a blank) - these are the tensors we will need
            during Viterbi decoding.
        utt_obj_batch: a list of Utterance objects for every utterance in the batch.
        output_timestep_duration: a float indicating the duration of a single output timestep from
            the ASR Model.
    c                 S   s   g | ]}|d  qS )r   r   )r$   r=   r   r   r    r)     r+   z'get_batch_variables.<locals>.<listcomp>r   r   T)return_hypothesesr2   r   r   r   r#   r   r   N)r   r2   delaymodel_stride_in_secstokens_per_chunkzSample:)desc)keep_logitsrQ   u   �rA   z'pred_text' of utterance zN is empty - we will not generate any output alignment files for this utterancez'text' of utterance r   )dtypewindow_stridezDon't have attribute 'window_stride' in 'model.cfg.preprocessor' => cannot calculate  model_downsample_factor => stopping processr   zDon't have attribute 'sample_rate' in 'model.cfg.preprocessor' => cannot calculate start  and end time of segments => stopping processz/Calculated that the model downsample factor is z9 and therefore the ASR model output timestep duration is z! -- will use this for all batches)7r]   r   r   no_graditems
transcriber   tupler   
y_sequencer   cfgpreprocessorr   r   r0   r.   r   rQ   cleanup)transcribe_simulate_cache_aware_streamingr   resetread_audio_filerR   r   rS   r   r   r!   r   r   r   r   r   maxr   r   r   rZ   r[   tensorr   onesint64
ValueErrorsf	SoundFileframes
samplerater   r   r   
featurizer
hop_length)6rW   r   r   r   align_using_pred_textr   r   r   vad_chunked_batch_sizesimulate_cache_aware_streaminguse_buffered_chunked_streamingbuffered_chunk_paramsr   Blog_probs_list_batchT_list_batchpred_text_batch
file_paths
hypothesesr   tmp_dirvtmp_hypothesesr   
hypothesisr   r   r   lhyplogitsy_list_batchU_list_batchutt_obj_batchi_liner=   gt_text_for_alignmentr   T_maxU_maxVT_batchU_batchlog_probs_batchblog_probs_uttr   y_batchy_uttU_uttr3   	audio_durn_input_framesmodel_downsample_factorr   r   r    get_batch_variables  sJ  







/
	







r;  )@r9   rG   r   dataclassesr   r   pathlibr   typingr   r   r   r   	soundfiler  r   "indicnlp.normalize.indic_normalizer	   &nemo.collections.asr.models.ctc_modelsr
   2nemo.collections.asr.models.hybrid_rnnt_ctc_modelsr   
nemo.utilsr   	tqdm.autor   utils.constantsr   r   r   hubloadr   utilsr   r   r   r   r!   r7   r?   r@   rO   rY   r`   rp   rv   rw   r   r   r   r   r   rE   r   r   r   r   r   r   Tensorr   boolr;  r   r   r   r    <module>   s   .		
	   q1
A
/




