o
    ݈hn5                     @   s   d dl Z d dlZd dlZd dlmZmZmZmZmZ d dl	Z	d dl
Z
d dlZd dlZd dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ ed	G d
d deZdS )    N)AnyDictListOptionalTuple)BaseStep)AudioSegment)Step)tqdm)AutoTokenizer)M2M100Encoderbuild_final_manifestc                   @   s@  e Zd Zdeeef dededededefddZd	efd
dZdee dee de	e fddZ
dd Zd;ddZdd Zdd ZdejdejfddZdedefdd Zd<d"e	e d#efd$d%Zd&ed'ed(ed)efd*d+Zd,ejd-ejfd.d/Zd,ejd-ejfd0d1Zd2eeef d3eeef fd4d5Zd6d7 Zd8d9 Zd:S )=BuildFinalManifestinfra
batch_sizeknn_neighborhoodmargin_algorithmsonar_devicemining_devicec                 C   s@  || _ || _|| _|| _|| _|| _| d| _| d| _| d| _	| dd| j	  | _
| d| _| dd | j	 d| j	 d	 | _td
| j| _td
| _i dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,| _tj| jd-d. tj| j
d-d. d S )/Ninput_manifest_pathfinal_manifestaligner_languagechunked_audio_path/aligner_language_idaligned_output_path
/manifest_z_with_output_file_paths.jsonz#cointegrated/SONAR_200_text_encodereneng_Latnhihin_Devabnben_Bengguguj_Gujrknkan_Kndamlmal_Mlymmrmar_Devanenpi_Devaorory_Oryapapan_Gurutatam_Tamltetel_Teluururd_Arabmnimni_Bengasasm_Bengsdsnd_Arabsasan_DevaTexist_ok)r   r   r   r   r   r   	get_stater   manifest_out_folderlanguagechunked_audio_out_folderr   segment_manifest_pathr   from_pretrainedtoencoderr   	tokenizersonar_lang_mappingosmakedirs)selfr   r   r   r   r   r    rN   b/var/www/eduai.edurigo.com/doc_train/edurigo_ai/Puru/tts/BhasaAnuvaad/step/build_final_manifest.py
initialise   s   		
zBuildFinalManifest.initialisectm_pathc              
   C   s   t |d}| }W d    n1 sw   Y  g }|D ](}|d}|t|d t|d |d dd |d dd f q|S )Nr          z<space>   )open	readlinessplitappendfloatreplacestrip)rM   rQ   fhandlinesdatalinepartsrN   rN   rO   _read_ctm_fileQ   s   



	z!BuildFinalManifest._read_ctm_filetextpath
separatorsc                 C   sp   |d u rt |d}| }W d    n1 sw   Y  tdd|}dd tdttj||D }|S )NrR   z[-_
\s]+rS   c                 S   s    g | ]}|  d kr|  qS ) )r^   ).0sentrN   rN   rO   
<listcomp>m   s
    z1BuildFinalManifest._read_text.<locals>.<listcomp>|)rX   readresubrZ   joinmapescape)rM   re   rf   rg   r_   sentsrN   rN   rO   
_read_texte   s   
zBuildFinalManifest._read_textc                 C   s6   |j d }t|}|| |||\}}||fS N   )shapefaissIndexFlatIPaddsearch)rM   xykdimidxsimindrN   rN   rO   _knnCPUu   s
   


zBuildFinalManifest._knnCPU        c              	   C   s  |j d }||d  }tj|j d |ftjd}tj|j d |ftjd}td|j d |D ]}	t|	| |j d }
g g }}td|j d |D ]=}t|| |j d }t|}t	|}|
|||  |||	|
 t||| \}}|| |||  ~qHtj|dd}tj|dd}tj| dd}t|	|
D ]-}t|D ]&}|||	 |||	 |f f |||f< |||	 |||	 |f f |||f< qqq.||fS )Nrv   rV   r   )dtypeaxis)rw   npzerosfloat32int64rangeminrx   ry   index_cpu_to_all_gpusrz   r{   r[   concatenateargsort)rM   r|   r}   r~   memr   r   r   r   xfromxtobsimsbindsyfromytor   bsimbindauxijrN   rN   rO   _knnGPU|   s4   



"
$&zBuildFinalManifest._knnGPUc                 C   s   || ||| d S )NrT   )dot)rM   r|   r}   fwd_meanbwd_meanmarginrN   rN   rO   _score   s   zBuildFinalManifest._scorec              	   C   sn   t |j}t|jd D ]'}t|jd D ]}	|||	f }
| || ||
 || ||
 ||||	f< qq|S )Nr   rv   )r   r   rw   r   r   )rM   r|   r}   candidate_indsr   r   r   scoresr   r   r~   rN   rN   rO   _score_candidates   s   ,z$BuildFinalManifest._score_candidates
fwd_scoresx2y_indc              	   C   s0  t  }t  }t }tt|D ]0}t|| }|||f |v r&|| q||||f  |||||f |||f f q|D ]J}d}t|| }	tt|	d ddD ]*}
|	|
 }|||f |v rfqWd}|||||f |||f f ||||f   |s||ddf qBt	|dd dd}|S )NFrT   Tc                 S   s   | d S )Nr   rN   )r|   rN   rN   rO   <lambda>       z/BuildFinalManifest._get_pairs.<locals>.<lambda>)keyreverse)
setlistr   lenr   argmaxrz   r[   r   sorted)rM   r   r   seen_srcskippedpairsr   max_inddonefwd_indr   rN   rN   rO   
_get_pairs   s4   
" zBuildFinalManifest._get_pairs	pred_textc                 C   s"   dt j||t|t|   S ru   )enchantutilslevenshteinr   )rM   re   r   rN   rN   rO   _calculate_alignment_score   s   z-BuildFinalManifest._calculate_alignment_scoreFtextslang_idc              	   C   s   | j | }|| j_g }t U tdt|| jD ]C}| j|||| j  ddddd| j	}| j
d	i |j}|j}	||	d d|	dd }
|rWtjj|
}
||
 qW d    n1 sgw   Y  tj|ddS )
Nr   ptTi   )return_tensorspadding
truncation
max_lengthr   rv   r   rN   )rJ   rI   src_langtorchinference_moder   r   r   rG   r   rH   last_hidden_stateattention_mask	unsqueezesumnn
functional	normalizer[   cat)rM   r   r   normlangembsr   batchseq_embsmaskmean_embrN   rN   rO   _encode_mean_pool   s6   

z$BuildFinalManifest._encode_mean_poolaudio_filepath
start_timedurationr   c                 C   s   d |dd dd d }| jd|  }| d| d}tj|dd t|d }|t|d  }	t|}
|
||	 }|| |S )	Nrh   r   r   .z.wavTr?   i  )	rp   rZ   rD   rK   rL   roundr   from_wavexport)rM   r   r   r   r   dir
out_folderfinal_audio_pathstartendaudio
sound_cliprN   rN   rO   _chunk_audio   s   "

zBuildFinalManifest._chunk_audiosegment_embeddingsinput_embeddingsc                 C   s   t | t | | jdkr| jn| j}|||t|jd | j\}}|jdd}|||t|jd | j\}}|jdd}	| j	dkrIdd }
n| j	dkrSd	d }
nd
d }
| 
|||||	|
}| ||S )Ncudar   rv   r   absolutec                 S   s   | S NrN   abrN   rN   rO   r     s    z4BuildFinalManifest._mine_sentences.<locals>.<lambda>distancec                 S   s   | | S r   rN   r   rN   rN   rO   r     r   c                 S   s   | | S r   rN   r   rN   rN   rO   r     r   )rx   normalize_L2r   r   r   r   rw   r   meanr   r   r   )rM   r   r   knnx2y_simr   x2y_meany2x_simy2x_indy2x_meanr   r   rN   rN   rO   _mine_sentences   s:   





	z"BuildFinalManifest._mine_sentencesc                    s^   t j|ddd}t j|ddd}t||j}|jdd\  fddtt D }|S )NrT   rv   )pr   )r   c                    s&   g | ]}| |   |   fqS rN   )item)ri   r   max_indices
max_valuesrN   rO   rk   0  s    z<BuildFinalManifest._mine_sentences_brute.<locals>.<listcomp>)Fr   r   mmTmaxr   r   )rM   r   r   segment_norm
input_norm
similarityoutputrN   r   rO   _mine_sentences_brute'  s   
z(BuildFinalManifest._mine_sentences_brute
input_linesegment_linec                 C   s  | d}|d u stj|sd S | |}g }g }g }t|D ]\\}}	| |d |	d |	d |}
||	d  ||	d  ||	d |	d |	d |	d | |	d |	d |d |d |
d | d	ro|d	 |d
 d	< | dr||d |d
 d< q | 	|| j
}|d D ]O}|d }| | d| d|d }t|dkrq| 	||}| ||}|D ]$\}}}|| | d|d
kr|| nd| d|d
kr|ndi qqdd |D S )Nsegments_level_ctm_filepathr   r   rv   rT   rU   alignment_audio_path)re   r   r   r   alignment_scoreaudio_filepath_originalr   chunked_audio_filepath	course_idr   video_idtext_miningr   re   rf   rg   _textrh   _mining_scorec                 S   s   g | ]	}t j|d dqS )F)ensure_ascii)jsondumps)ri   rb   rN   rN   rO   rk   z  s    z:BuildFinalManifest._get_manifest_lines.<locals>.<listcomp>)getrK   rf   existsrd   	enumerater   r[   r   r   r   rt   r   r  update)rM   r  r  rQ   ra   manifestsegment_sentssegment_pred_sentsr   rowr  r   miningr   input_sentsr   r   seg_idxinp_idxscorerN   rN   rO   _get_manifest_lines7  sl   




z&BuildFinalManifest._get_manifest_linesc              	   C   s   t | jd}| }W d    n1 sw   Y  t | jd}| }W d    n1 s/w   Y  tt||D ];\}}| t|t|}|d urvt | j	 d| j
 dd}|d|d  W d    n1 sqw   Y  q;d S )NrR   r   z.jsonlza+
)rX   rE   rY   r   r
   zipr#  r  loadsrB   rC   writerp   )rM   r_   segment_linesinput_linesr  r  r  rN   rN   rO   run|  s&   

zBuildFinalManifest.runc                 C   s   | ` | `d S r   )rH   rI   )rM   rN   rN   rO   cleanup  s   zBuildFinalManifest.cleanupN)r   )F)__name__
__module____qualname__r   strintrP   rd   r   r   rt   r   r   r   r   r   ndarrayr   r   r   r\   r   r   r   Tensorr  r   r#  r*  r+  rN   rN   rN   rO   r      sn    

<

	+

(



Er   )r  rK   rn   typingr   r   r   r   r   r   rx   numpyr   r   torch.nn.functionalr   r   r   baser   pydubr   step_decoratorr	   r
   transformersr   ,transformers.models.m2m_100.modeling_m2m_100r   r   rN   rN   rN   rO   <module>   s"    