o
    b.f                     @   sB  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ dZerKd	ad
d Ze  e e j rTdndZedZedZedZedZedZe  dZ!e Z"e j re#eZe#eZdd Z$dd Z%dd Z&dd Z'dd Z(dd Z)dS )    N)KeywordProcessor)cosine_similarity)SentenceTransformer)	Sense2Vec)T5ForConditionalGenerationT5Tokenizer)NormalizedLevenshteinTFc                   C   s$   t std td da d S d S )Npunkt	stopwordsT)nltk_resources_initializednltkdownload r   r   Q/var/www/chatrigo.edurigo1.com/get_recommendations/testing/main_static_partial.pyinitialize_nltk_resources   s
   

r   cudacpuzgoogle-t5/t5-basezmsmarco-distilbert-base-v3zramsrigouthamg/t5_squad_v1s2v_oldc                 C   s,   d}t | D ]}| }|d | }q|S )N  )r   sent_tokenize
capitalize)contentfinalsentr   r   r   postprocesstext1   s
   r   c           
         s   t j }|j| dd ddh}ttj}|g d7 }|tjj	
d7 }|j|d |jdd	d
d |jdd}dd |D }t }|D ]}|| qD||  fdd|D }	|	d d S )Nen)inputlanguagePROPNNOUN)z-lrb-z-rrb-z-lcb-z-rcb-z-lsb-z-rsb-english)posg?g      ?average)alpha	thresholdmethod   nc                 S   s   g | ]}|d  qS r   r   .0valr   r   r   
<listcomp>B       z get_keywords.<locals>.<listcomp>c                    s   g | ]}| v r|qS r   r   )r,   keywordkeywords_foundr   r   r.   G   s       )pkeunsupervisedMultipartiteRankload_documentliststringpunctuationr   corpusr
   wordscandidate_selectioncandidate_weighting
get_n_bestr   add_keywordextract_keywords)
originaltextsummarytext	extractorr"   stoplist
keyphraseskeywordskeyword_processorr0   important_keywordsr   r1   r   get_keywords8   s    


rJ   c              	      s~   d | |} j|dddddt}|d |d }}|j||dd	d
ddd} fdd|D }	|	d dd}
|
 }
|
S )Nzcontext: {} answer: {}i  FTpt)
max_lengthpad_to_max_length
truncationreturn_tensors	input_idsattention_mask         H   )rP   rQ   early_stopping	num_beamsnum_return_sequencesno_repeat_ngram_sizerL   c                    s   g | ]	} j |d dqS )T)skip_special_tokens)decode)r,   ids	tokenizerr   r   r.   X   s    z get_question.<locals>.<listcomp>r   z	question:r   )formatencode_plustodevicegeneratereplacestrip)contextanswermodelr^   textencodingrP   rQ   outsdecQuestionr   r]   r   get_questionJ   s   	rn   c           	         s   g }zt j| g dd t j |d} fdd|D }W n   g }Y d}| g}| }|D ]}t|d  | |k rL||vrL||vrL|| q/|dd  S )	N)r    PERSONPRODUCTLOCORGEVENTNORPzWORK OF ARTFACGPENUMFACILITY)sensesr(   c                    sN   g | ]#}|d   dd   dd kr|d   dd  dd  qS )r   |rS   _r   )splitrd   titlere   r+   senser   r   r.   d   s   N z'sense2vec_get_words.<locals>.<listcomp>g333333?r   rS   )s2vget_best_sensemost_similarr|   normalized_levenshtein
similaritylowerappend)	wordtopnquestionoutputr   r%   r   	checklistxr   r~   r   sense2vec_get_words_   s   ,
r   c                    s   t || }t |}t|g  fddttD }t|d D ]8}||d d f }	tj|| d d  f dd}
||	 d| |
dd  }|t| } | || q"fdd D S )Nc                    s   g | ]
}| d  kr|qS r*   r   )r,   i)keywords_idxr   r   r.   s   s    zmmr.<locals>.<listcomp>rS   )axisc                    s   g | ]} | qS r   r   )r,   idx)r<   r   r   r.   {   r/   )	r   npargmaxrangelenmaxreshaper   remove)doc_embeddingword_embeddingsr<   top_nlambda_paramword_doc_similarityword_similaritycandidates_idxr{   candidate_similaritiestarget_similaritiesmmrmmr_idxr   )r   r<   r   r   o   s   

r   c                 C   s   t | ||}t|dkr|S |  g}|| |d |   }	||	g}
||}tt|d}t|
||||}|  g}|D ]}| |  krR||  qA|dd  }|S )Nr   r   rR   rS   )	r   r   r   extendencodeminr   r   r   )r   origsentencer   	lambdavalr   r   sentence_transformer_modeldistractorsdistractors_newembedding_sentencekeyword_embeddingdistractor_embeddingsmax_keywordsfiltered_keywordsr   wrdr   r   r   get_distractors}   s"   



r   )*torchrandomnumpyr   r   r9   r4   	flashtextr   sklearn.metrics.pairwiser   sentence_transformersr   	sense2vecr   transformersr   r   !similarity.normalized_levenshteinr   RUN_INITIALIZATIONr   r   rb   r   is_availablefrom_pretrainedsummary_modelsummary_tokenizerr   question_modelquestion_tokenizer	from_diskr   r   ra   r   rJ   rn   r   r   r   r   r   r   r   <module>   sD   






