o
    ݈h:                     @   sj  d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	 d dl
mZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dl m Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- eG dd dZ.eG dd dZ/eG dd dZ0edG dd deZ1dS )    N)	dataclassfield)Path)AnyDictListOptional)BaseStep)EncDecCTCModel)EncDecHybridRNNTCTCModel)FrameBatchASR)setup_model)logging)	OmegaConf)Step)tqdm)add_t_start_end_to_utt_objget_batch_starts_endsget_batch_variablesget_manifest_lines_batch)make_ass_files)make_ctm_files)write_manifest_out_line)viterbi_decodingc                   @   s&   e Zd ZU dZeed< dZeed< dS )CTMFileConfigFremove_blank_tokensr   minimum_timestamp_durationN)__name__
__module____qualname__r   bool__annotations__r   float r#   r#   U/var/www/eduai.edurigo.com/doc_train/edurigo_ai/Puru/tts/BhasaAnuvaad/step/aligner.pyr       s   
 r   c                   @   s   e Zd ZU dZeed< dZeed< dZe	ed< dZ
eed< ed	d
 dZee ed< edd
 dZee ed< edd
 dZee ed< dS )ASSFileConfig   fontsizecentervertical_alignmentFresegment_text_to_fill_space   max_lines_per_segmentc                   C      g dS )N)1   .   =   r#   r#   r#   r#   r$   <lambda>4       zASSFileConfig.<lambda>default_factorytext_already_spoken_rgbc                   C   r-   )N)9      	   r#   r#   r#   r#   r$   r1   7   r2   text_being_spoken_rgbc                   C   r-   )N)         r#   r#   r#   r#   r$   r1   :   r2   text_not_yet_spoken_rgbN)r   r   r   r'   intr!   r)   strr*   r    r,   r   r5   r   r9   r=   r#   r#   r#   r$   r%   *   s   
 r%   c                   @   s\  e Zd ZU dZee ed< dZee ed< dZee ed< dZ	ee ed< dZ
ee ed< dZeed< dZee ed	< dZee ed
< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< edd dZee ed< edd dZe ed < ed!d dZ!e"ed"< dS )#AlignmentConfigNpretrained_name
model_pathmanifest_filepath
output_dirlanguage_idFalign_using_pred_texttranscribe_deviceviterbi_device   manifest_batch_sizeuse_silero_vadvad_chunked_batch_sizeTuse_local_attention%additional_segment_grouping_separatoraudio_filepath_parts_in_utt_iduse_buffered_chunked_streamingg?chunk_len_in_secsg      @total_buffer_in_secs    chunk_batch_sizesimulate_cache_aware_streamingc                   C   s   dgS )Nctmr#   r#   r#   r#   r$   r1   \       zAlignmentConfig.<lambda>r3   save_output_file_formatsc                   C      t  S N)r   r#   r#   r#   r$   r1   ]   rW   ctm_file_configc                   C   rY   rZ   )r%   r#   r#   r#   r$   r1   ^   rW   ass_file_config)#r   r   r   rA   r   r?   r!   rB   rC   rD   rE   rF   r    rG   rH   rJ   r>   rK   rL   rM   rN   rO   rP   rQ   r"   rR   rT   rU   r   rX   r   r[   r   r\   r%   r#   r#   r#   r$   r@   >   s.   
 r@   alignerc                   @   s   e Zd Z										ddeeef dee dee dee dee d	ed
ee dee dededefddZde	fddZ
dd ZdS )AlignerNFrI   infrarB   rA   rN   custom_model_classrF   rG   rH   rJ   rK   rL   c                 C   s  || _ | d}| d}| d| d}| dd|  }tj| d| drPt| d| d}dd	 | D | _W d    n1 sJw   Y  ng | _t||||| d
||||	|
|d|d| _	| j	j
d u rx| j	jd u rxtd| j	j
d ur| j	jd urtd| j	jdk rtd| j	jdk rtd| j	jdks| j	jdkrtd| j	jjdk rtd| j	jjdvrtd| j	jj| j	jj| j	jjfD ]}t|dkrtdq| j	jd u rttj rdnd| _nt| j	j| _td| j  | j	jd u rttj rdnd| _nt| j	j| _td| j  | jjdks7| jjdkr<td t | j	| j\| _!}| j!"  t#| j!t$rY| j!j%d d! | j	j&rmtd" | j!j'd#d$d$gd% t#| j!t(st#| j!t$st)d&| j	jjdkrtd'| j	jj d( i | _*| j	j+rt,-| j!j.}t/0|j1d) d*|j1_2d|j1_3|j1j4d+krt5d, t/0|j1d |j1d- }|| j	j6 }| j	j7}t8| j	j9}t:;|| }t:;||| d.  | }td/| d0|  t<| j!|| j	j7| j	j=d1| _!|||d2| _*n|d ur(t>?d3}t@||}|| j!| d
d4| _!tA||	\| _B| _Cd | _DtEtF|jGd }tjH|dd5 tEtF|| }t|d6| _Id S )7Naligner_manifest_pathaligner_languagez
/manifest_z.jsonlaligned_output_path/z_with_output_file_paths.jsonc                 S   s   g | ]	}t |d  qS audio_filepath)jsonloads.0liner#   r#   r$   
<listcomp>}   s    z&Aligner.initialise.<locals>.<listcomp>aligner_language_idT)rA   rB   rC   rD   rE   rF   rG   rH   rJ   rK   rL   rM   rN   z2Both model_path and pretrained_name cannot be Nonez2One of model_path and pretrained_name must be NonerI   z7manifest_batch_size cannot be zero or a negative numberz:vad_chunked_batch_size cannot be zero or a negative number  zGadditional_grouping_separator cannot be empty string or space characterr   z6minimum_timestamp_duration cannot be a negative number)topr(   bottomzMass_file_config.vertical_alignment must be one of 'top', 'center' or 'bottom'   zass_file_config.text_already_spoken_rgb, ass_file_config.text_being_spoken_rgb, and ass_file_config.text_already_spoken_rgb all need to contain exactly 3 elements.cudacpuzBDevice to be used for transcription step (`transcribe_device`) is z9Device to be used for viterbi step (`viterbi_device`) is zOne or both of transcribe_device and viterbi_device are GPUs. If you run into OOM errors it may help to change both devices to be the CPU.ctc)decoder_typezdFlag use_local_attention is set to True => will try to use local attention for model if it allows itrel_pos_local_attn@   )self_attention_modelatt_context_sizezModel is not an instance of NeMo EncDecCTCModel or ENCDecHybridRNNTCTCModel. Currently only instances of these models are supportedz;ctm_file_config.minimum_timestamp_duration has been set to zd seconds. This may cause the alignments for some tokens/words/additional segments to be overlapping.Fg        per_featurez\Only EncDecCTCModelBPE models trained with per_feature normalization are supported currentlywindow_strider+   ztokens_per_chunk is z, mid_delay is )	asr_model	frame_lentotal_buffer
batch_size)delaymodel_stride_in_secstokens_per_chunkmodels)modelrE   )exist_okza+)Jr_   	get_stateospathexistsopen	readlinesdoner@   cfgrB   rA   
ValueErrorrJ   rL   rN   r[   r   r\   r)   r5   lenrG   torchdevicers   is_availabler   inforH   typewarningr   r   eval
isinstancer   change_decoding_strategyrM   change_attention_modelr
   NotImplementedErrorbuffered_chunk_paramsrP   copydeepcopy_cfgr   
set_structmodel_preprocessorditherpad_to	normalizeerrormodel_downsample_factorrR   r"   rQ   mathceilr   rT   	importlibimport_modulegetattrr   startsendsoutput_timestep_durationr?   r   stemmakedirsf_manifest_out)selfr_   rB   rA   rN   r`   rF   rG   rH   rJ   rK   rL   ra   languagerC   rD   fhandrgb_list_	model_cfgfeature_strider   r   	chunk_lenr   	mid_delaymodule	model_clstgt_manifest_nametgt_manifest_filepathr#   r#   r$   
initialisec   s,  


	











zAligner.initialisereturnc                    sF  t t j jD ]\}}t jj|| jj}t fdd|D }| jj	kr*q	t
| j jj jj jj jj j jj jj jj jj j\}}}}}	}
t|||| j}tt|	|D ]8\}\}}t|||
}d jjv rt| jj jj j|| |
}d jjv rt| jj jj}t j | qbq	 j !  d S )Nc                    s   g | ]	}|d   j v qS re   )r   ri   r   r#   r$   rl   D  s    zAligner.run.<locals>.<listcomp>rV   ass)"r   zipr   r   r   r   rC   rE   sumrJ   r   r   rN   rF   rO   r   rK   rL   rU   rP   r   r   rH   	enumerater   rX   r   rD   r[   r   r\   r   r   close)r   startendmanifest_lines_batchr   log_probs_batchy_batchT_batchU_batchutt_obj_batchr   alignments_batchidxutt_objalignment_uttr#   r   r$   run<  st   		zAligner.runc                 C   s   | ` d S rZ   )r   r   r#   r#   r$   cleanup  s   zAligner.cleanup)
NNNNFNNrI   FrI   )r   r   r   r   r?   r>   r   r    r   r   r   r   r#   r#   r#   r$   r^   a   sJ    
	

 ZDr^   )2r   r   rg   r   r   dataclassesr   r   pathlibr   typingr   r   r   r   r   baser	   &nemo.collections.asr.models.ctc_modelsr
   2nemo.collections.asr.models.hybrid_rnnt_ctc_modelsr   0nemo.collections.asr.parts.utils.streaming_utilsr   1nemo.collections.asr.parts.utils.transcribe_utilsr   
nemo.utilsr   	omegaconfr   step_decoratorr   r   utils.data_prepr   r   r   r   utils.make_ass_filesr   utils.make_ctm_filesr   utils.make_output_manifestr   utils.viterbi_decodingr   r   r%   r@   r^   r#   r#   r#   r$   <module>   s>    	"