o
    6h/                  	   @   s   d dl Z d dlm  mZ d dlmZmZ d dlZ	d dl
Zd dlZd dlmZ d dlZd dlmZmZmZ d dlZd dlmZ d dlZd dlZd dlZG dd dZdadefd	d
Zddededeeef fddZdS )    N)	VitsModelAutoTokenizer)Path)OptionalUnionList)ThreadPoolExecutorc                   @   s  e Zd ZdZ					d.dedee ded	ed
ef
ddZdee de	j
fddZdd Zdd Zdd ZdedefddZdedejfddZ		d/dedededejfdd Zd!ejdejfd"d#Zd0ded%edefd&d'Zd0ded%edefd(d)Zd*d+ Zdefd,d-ZdS )1HindiTTSProcessorzz
    Professional Hindi Text-to-Speech Processing System
    Optimized for speed, accuracy, and production deployment
    facebook/mms-tts-hinNTd   
model_namedeviceenable_cuda_optimization
cache_sizeuse_half_precisionc                 C   sh   || _ || _i | _tj|d| _| || _|o| jjdk| _	| 
  |r.tj r.|   |   dS )a  
        Initialize the TTS system with performance optimizations
        
        Args:
            model_name: HuggingFace model identifier
            device: Target device ('cuda', 'cpu', or None for auto-detection)
            enable_cuda_optimization: Enable CUDA-specific optimizations
            cache_size: Size of the audio cache for repeated texts
            use_half_precision: Use FP16 for faster inference (if supported)
        )maxsizecudaN)r   r   audio_cachequeueQueuecache_queue_setup_devicer   typer   _load_modelstorchr   is_available_apply_cuda_optimizations_warmup_model)selfr   r   r   r   r    r   L/var/www/eduai.edurigo.com/doc_train/edurigo_ai/text_to_speech/processing.py__init__   s   zHindiTTSProcessor.__init__returnc                 C   sR   |du rt j rd}nd}t |}|jdkr't j  dt jj_dt jj_	|S )z"Setup optimal device configurationNr   cpuTF)
r   r   r   r   r   empty_cachebackendscudnn	benchmarkdeterministic)r   r   
device_objr   r   r    r   :   s   





zHindiTTSProcessor._setup_devicec                 C   sp   t j| jddd| _tj| j| jrtjntjddd| _	| j	
| j| _	| j	  t| j	dr6d| j	j_dS dS )z#Load models with optimization flagsTz.cache/tokenizers)use_fast	cache_dirz.cache/models)torch_dtypelow_cpu_mem_usager+   configN)r   from_pretrainedr   	tokenizerr   r   r   float16float32modeltor   evalhasattrr.   	use_cacher   r   r   r    r   L   s    
zHindiTTSProcessor._load_modelsc              
   C   s   t j rAtt dr(zt j| jdd| _W n ty' } zW Y d}~nd}~ww t| jdrCz| jjt jd| _W dS    Y dS dS dS )z!Apply CUDA-specific optimizationscompilezmax-autotune)modeNr4   )memory_format)	r   r   r   r6   r9   r3   	Exceptionr4   channels_last)r   er   r   r    r   e   s   

	z+HindiTTSProcessor._apply_cuda_optimizationsc                 C   s   d}t  ' | j|dd| j}| jrdd | D }| jdi |}W d   n1 s0w   Y  | jjdkrBt j	
  dS dS )	z'Warmup model for consistent performanceu   नमस्तेpt)return_tensorsc                 S   *   i | ]\}}||j tjkr| n|qS r   dtyper   r2   half.0kvr   r   r    
<dictcomp>}   s   * z3HindiTTSProcessor._warmup_model.<locals>.<dictcomp>Nr   r   )r   no_gradr0   r4   r   r   itemsr3   r   r   r$   )r   warmup_textinputs_r   r   r    r   v   s   
	zHindiTTSProcessor._warmup_modeltextc                 C   s   dt |   S )zGenerate cache key for texttts_)hashstriplower)r   rO   r   r   r    _get_cache_key   s   z HindiTTSProcessor._get_cache_keykey
audio_datac                 C   sz   t | j| jkr#z| j }|| jv r| j|= W n
 tjy"   Y nw || j|< z	| j| W dS  tjy<   Y dS w )z)Manage audio cache with LRU-like behaviorN)	lenr   r   r   
get_nowaitr   Empty
put_nowaitFull)r   rU   rV   
oldest_keyr   r   r    _manage_cache   s   


zHindiTTSProcessor._manage_cacher7   normalize_audioc              	   C   s  |r|  s
td|  }|r| |nd}|r#|| jv r#| j| S t  | j|ddddd| j}| j	rBdd |
 D }| j	rktj rktjjd	dd
 | jdi |}W d   n1 sew   Y  n| jdi |}t|dr||j}n(t|dr|j}nt|D ]}t||}	t|	tjr|	jdkr|	} nqtd|    }
|r| |
}
W d   n1 sw   Y  |r|r| ||
 |
S )aD  
        Synthesize speech from Hindi text with maximum performance
        
        Args:
            text: Hindi text to synthesize
            use_cache: Use audio cache for repeated texts
            normalize_audio: Normalize output audio
            
        Returns:
            Audio waveform as numpy array
        zText cannot be emptyNr?   Ti   )r@   padding
truncation
max_lengthc                 S   rA   r   rB   rE   r   r   r    rI      s    z0HindiTTSProcessor.synthesize.<locals>.<dictcomp>r   )enabledwaveformaudio   z-Could not find audio output in model responser   )rR   
ValueErrorrT   r   r   rJ   r0   r4   r   r   rK   r   r   ampautocastr3   r6   rc   rd   dirgetattr
isinstanceTensorndimRuntimeErrorsqueezer#   floatnumpy_normalize_audior]   )r   rO   r7   r^   	cache_keyrM   outputrc   	attr_name
attr_valueaudio_npr   r   r    
synthesize   sZ   

	



,zHindiTTSProcessor.synthesizerd   c                 C   s6   |t | }t t |}|dkr|d|  }|S )z@Normalize audio to prevent clipping and ensure consistent volumer   g?)npmeanmaxabs)r   rd   max_valr   r   r    rr      s
   z"HindiTTSProcessor._normalize_audio>  sample_ratec              
   C   s   z(|  |}|jtjkr|tj}t }tj|||dd |	d |
 W S  tyD } ztdt|j dt|   d}~ww )z
        Synthesize speech and return as bytes for API response
        
        Args:
            text: Hindi text to synthesize
            sample_rate: Audio sample rate
            
        Returns:
            Audio data as bytes
        WAV)formatr   zError in synthesize_to_bytes: z: N)rx   rC   ry   r2   astypeioBytesIOsfwriteseekgetvaluer<   printr   __name__str)r   rO   r   rw   bufferr>   r   r   r    synthesize_to_bytes   s   


z%HindiTTSProcessor.synthesize_to_bytesc                 C   s   |  ||}t|dS )z
        Synthesize speech and return as base64 encoded string
        
        Args:
            text: Hindi text to synthesize
            sample_rate: Audio sample rate
            
        Returns:
            Base64 encoded audio data
        zutf-8)r   base64	b64encodedecode)r   rO   r   audio_bytesr   r   r    synthesize_to_base64  s   z&HindiTTSProcessor.synthesize_to_base64c                 C   sL   | j   | j s$z| j  W n tjy   Y dS w | j r
dS dS )z Clear audio cache to free memoryN)r   clearr   emptyrX   r   rY   r8   r   r   r    clear_cache  s   

zHindiTTSProcessor.clear_cachec              	   C   sZ   t | j| jt| j| j| jtj	 tj	 rtj
 ndtj	 r)tj dS ddS )zGet performance statisticsr   )r   half_precisionr   max_cache_sizer   cuda_availablecuda_memory_allocatedcuda_memory_reserved)r   r   r   rW   r   r   r   r   r   r   memory_allocatedmemory_reservedr8   r   r   r    	get_stats%  s   zHindiTTSProcessor.get_stats)r
   NTr   T)TT)r~   )r   
__module____qualname____doc__r   r   boolintr!   r   r   r   r   r   r   rT   ry   ndarrayr]   rx   rr   bytesr   r   r   dictr   r   r   r   r    r	      sP    
#
K	r	   r"   c                   C   s   t du rtdddda t S )z/Get or create the global TTS processor instanceNT2   )r   r   r   )_tts_processorr	   r   r   r   r    get_tts_processor6  s   r   r   rO   output_formatc                 C   s"   t  }|dkr|| S || S )z
    Main processing function for text to speech conversion
    
    Args:
        text: Hindi text to convert to speech
        output_format: Output format - 'bytes' or 'base64'
        
    Returns:
        Audio data in requested format
    r   )r   r   r   )rO   r   	processorr   r   r    process_text_to_speechA  s   

r   )r   ) r   torch.nn.functionalnn
functionalFtransformersr   r   	soundfiler   rq   ry   timepathlibr   gctypingr   r   r   	threadingconcurrent.futuresr   r   r   r   r	   r   r   r   r   r   r   r   r   r    <module>   s(      %$