import torch

from utils.constants import V_NEGATIVE_NUM


def viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device):
    """
    Do Viterbi decoding with an efficient algorithm (the only for-loop in the 'forward pass' is over the time dimension).
    Args:
        log_probs_batch: tensor of shape (B, T_max, V). The parts of log_probs_batch which are 'padding' are filled
            with 'V_NEGATIVE_NUM' - a large negative number which represents a very low probability.
        y_batch: tensor of shape (B, U_max) - contains token IDs including blanks in every other position. The parts of
            y_batch which are padding are filled with the number 'V'. V = the number of tokens in the vocabulary + 1 for
            the blank token.
        T_batch: tensor of shape (B, 1) - contains the durations of the log_probs_batch (so we can ignore the
            parts of log_probs_batch which are padding)
        U_batch: tensor of shape (B, 1) - contains the lengths of y_batch (so we can ignore the parts of y_batch
            which are padding).
        viterbi_device: the torch device on which Viterbi decoding will be done.

    Returns:
        alignments_batch: list of lists containing locations for the tokens we align to at each timestep.
            Looks like: [[0, 0, 1, 2, 2, 3, 3, ...,  ], ..., [0, 1, 2, 2, 2, 3, 4, ....]].
            Each list inside alignments_batch is of length T_batch[location of utt in batch].
    """
    B, T_max, _ = log_probs_batch.shape
    U_max = y_batch.shape[1]

    # transfer all tensors to viterbi_device
    log_probs_batch = log_probs_batch.to(viterbi_device)
    y_batch = y_batch.to(viterbi_device)
    T_batch = T_batch.to(viterbi_device)
    U_batch = U_batch.to(viterbi_device)

    # pad the vocabulary dimension with one extra 'very low probability' column, so that the padding
    # token IDs in y_batch (which equal V) gather a probability of V_NEGATIVE_NUM
    padding_for_log_probs = V_NEGATIVE_NUM * torch.ones((B, T_max, 1), device=viterbi_device)
    log_probs_padded = torch.cat((log_probs_batch, padding_for_log_probs), dim=2)

    # v_prev holds the Viterbi log-probabilities of the previous timestep, shape (B, U_max).
    # At t=0 only the first two token positions (initial blank or first real token) can be occupied.
    v_prev = V_NEGATIVE_NUM * torch.ones((B, U_max), device=viterbi_device)
    v_prev[:, :2] = torch.gather(input=log_probs_padded[:, 0, :], dim=1, index=y_batch[:, :2])

    # backpointers_rel[b, t, u] stores how many token positions (0, 1 or 2) the best path stepped back
    # to reach position u at time t
    backpointers_rel = -99 * torch.ones((B, T_max, U_max), dtype=torch.int8, device=viterbi_device)

    # letter_repetition_mask is True wherever a token equals the token two places before it in y_batch,
    # i.e. where the 'skip over one position' transition must be forbidden
    y_shifted_left = torch.roll(y_batch, shifts=2, dims=1)
    letter_repetition_mask = y_batch - y_shifted_left
    letter_repetition_mask[:, :2] = 1  # never mask the first two tokens
    letter_repetition_mask = letter_repetition_mask == 0

    for t in range(1, T_max):

        # e_current: log probs of every token in y_batch at the current timestep, shape (B, U_max)
        e_current = torch.gather(input=log_probs_padded[:, t, :], dim=1, index=y_batch)

        # For utterances whose audio has already ended (t >= T_batch), zero out the emission scores at the
        # positions which are allowed to be final, so their Viterbi probabilities are not corrupted while we
        # keep iterating over the 'padding' timesteps.
        t_exceeded_T_batch = t >= T_batch

        U_can_be_final = torch.logical_or(
            torch.arange(0, U_max, device=viterbi_device).unsqueeze(0) == (U_batch.unsqueeze(1) - 0),
            torch.arange(0, U_max, device=viterbi_device).unsqueeze(0) == (U_batch.unsqueeze(1) - 1),
        )

        mask = torch.logical_not(torch.logical_and(t_exceeded_T_batch.unsqueeze(1), U_can_be_final)).long()

        e_current = e_current * mask

        # Viterbi probabilities of the previous timestep, shifted by 1 and by 2 token positions
        v_prev_shifted = torch.roll(v_prev, shifts=1, dims=1)
        v_prev_shifted[:, 0] = V_NEGATIVE_NUM  # the roll wrapped around; position 0 has no predecessor at u-1

        v_prev_shifted2 = torch.roll(v_prev, shifts=2, dims=1)
        v_prev_shifted2[:, :2] = V_NEGATIVE_NUM  # the first two positions have no predecessor at u-2
        # forbid skipping a position when the tokens two apart are identical (e.g. blank -> blank)
        v_prev_shifted2.masked_fill_(letter_repetition_mask, V_NEGATIVE_NUM)

        # stack the three candidate predecessors so max/argmax can be taken for every position at once
        v_prev_dup = torch.cat(
            (v_prev.unsqueeze(2), v_prev_shifted.unsqueeze(2), v_prev_shifted2.unsqueeze(2)), dim=2,
        )

        # candidate Viterbi probabilities for every token position; keep the best and record its backpointer.
        # The result is written straight back into v_prev so it is ready for the next loop iteration.
        candidates_v_current = v_prev_dup + e_current.unsqueeze(2)
        v_prev, bp_relative = torch.max(candidates_v_current, dim=2)

        backpointers_rel[:, t, :] = bp_relative

    # trace the backpointers to recover one alignment per utterance in the batch
    alignments_batch = []
    for b in range(B):
        T_b = int(T_batch[b])
        U_b = int(U_batch[b])

        if U_b == 1:  # the reference text was empty, so y_batch contains only a single blank token
            current_u = 0
        else:
            # the alignment may end on either the final token or the final blank - pick the more probable one
            current_u = int(torch.argmax(v_prev[b, U_b - 2 : U_b])) + U_b - 2
        alignment_b = [current_u]
        for t in range(T_max - 1, 0, -1):
            current_u = current_u - int(backpointers_rel[b, t, current_u])
            alignment_b.insert(0, current_u)
        alignment_b = alignment_b[:T_b]
        alignments_batch.append(alignment_b)

    return alignments_batch
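

# Minimal usage sketch (illustrative only, not part of the original module): it builds the toy tensors
# the docstring describes and runs Viterbi decoding on CPU. The vocabulary size, blank index and token
# IDs below are invented assumptions for the demo; in the real pipeline these come from the ASR model's
# log-probabilities and from the data-preparation step that interleaves blanks into y_batch.
if __name__ == "__main__":
    torch.manual_seed(0)

    V = 5  # number of classes in log_probs (subword tokens + blank) - made-up value for the demo
    blank_id = V - 1  # assume the blank is the last class, as in CTC-style models
    B, T_max = 2, 8

    # toy log-probabilities of shape (B, T_max, V)
    log_probs_batch = torch.randn(B, T_max, V).log_softmax(dim=-1)
    # frames beyond the true duration of utterance 1 are 'padding' and get filled with V_NEGATIVE_NUM
    log_probs_batch[1, 6:, :] = V_NEGATIVE_NUM

    # each y_batch row looks like [blank, token, blank, token, blank] - blanks in every other position
    token_ids = [[0, 2], [1, 3]]  # made-up token IDs for the two utterances
    y_batch = torch.tensor(
        [[blank_id] + [x for tok in toks for x in (tok, blank_id)] for toks in token_ids]
    )  # shape (B, U_max) with U_max = 5

    T_batch = torch.tensor([T_max, 6])  # true number of frames per utterance
    U_batch = torch.tensor([y_batch.shape[1]] * B)  # true number of entries in each y_batch row

    alignments = viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, torch.device("cpu"))
    for alignment in alignments:
        # a list of y_batch position indices, one per timestep, truncated to T_batch[b] entries
        print(alignment)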