o
    jghak                  
   @   sh  d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	m
Z
 d dlmZmZmZ d dlZd dlmZmZ d dlZd dlmZ d dlmZ d dlZd dlmZ d dlmZ d dlZd d	lmZ d dl Z e!ej"j# e$e%Z&eG d
d dZ'G dd dZ(e( Z)dde*de+dee* fddZ,de*dee* de*fddZ-dde*de+deee*e*f  fddZ.dS )    N)ListDictAnyOptionalTuple)
quote_plusurljoinurlparse)ThreadPoolExecutoras_completed)	dataclass)defaultdict)	lru_cache)BeautifulSoup)	UserAgentc                   @   sb   e Zd ZU dZeed< eed< eed< eed< eed< dZeed< d	Ze	e ed
< dZ
eed< d	S )SearchResultz,Enhanced search result with URL and metadatatitleurlsnippetsourcerelevance_score domainN	timestamp        content_quality)__name__
__module____qualname____doc__str__annotations__floatr   r   r   r    r#   r#   a/var/www/eduai.edurigo.com/doc_train/edurigo_ai/my_career/testing/advanced_chatbot/search_tool.pyr      s   
 r   c                   @   s   e Zd ZdZdd ZdefddZd$ded	ed
edede	e
j f
ddZd%dededefddZdefddZdedededefddZd&dedededee fddZd'dededee fdd Zd!ee dee fd"d#ZdS )(ReliableWebSearchEnginezLHighly reliable web scraping search engine with multiple fallback strategiesc                 C   s,  t  t t d| _| j D ]\}}d|_d|_|j	dddddd	dd
 qg d| _
g dddddddg dg dg dg ddddgdddddg dg dddgg ddd d!gdd"dd#d$d%gd&d'gd'd(gd)d*gdd+d,gdd-dd.d/d0gd1d2gd3d4gd5d6gdd7| _tt| _d S )8Ncloudscraperrequestsrequests_altF   Jtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8en-US,en;q=0.9gzip, deflate
keep-alive1no-cache)AcceptAccept-LanguageAccept-Encoding
ConnectionUpgrade-Insecure-RequestsCache-ControlDNT)zoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36zuMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36zeMozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36zPMozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0zTMozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/121.0z_Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/121.0.0.0)zhttps://www.google.com/searchzhttps://google.com/searchzhttps://www.google.co.uk/searchr   enusr   )qnumhlglstart)zdiv.gzdiv[data-sokoban-container]z.rcz
div.yuRUbf)h3z	h3.LC20lbzh3.rzdiv.yuRUbf h3)azdiv.yuRUbf azh3 a)z.VwiC3bz.s3v9rdz.stzspan.aCOpRez
div.IsZvec)urlsparamsresult_selectorstitle_selectorslink_selectorssnippet_selectorszhttps://www.bing.com/searchzhttps://bing.com/search   )r:   countsetlangfirst)z	li.b_algozli.b_adz.b_algo)h2h2 az.b_titler@   rL   )z.b_caption pz
.b_captionz
.b_snippetzhttps://duckduckgo.com/htmlz https://html.duckduckgo.com/htmlzus-en)r:   klsz.resultz
div.resultz.result__titlez.result__title aza.result__az.result__snippetz.result__bodyzhttps://yandex.com/searchzhttps://www.yandex.com/searchT   )textlrnumdocz
.serp-itemzli.serp-itemzh2.serp-item__titlez.serp-item__titlezh2.serp-item__title az.serp-item__title az.serp-item__textz.serp-item__snippetgooglebing
duckduckgoyandex)r'   create_scraperr(   Sessionsessionsitemsverifytimeoutheadersupdateuser_agentssearch_enginesr   intsuccess_stats)selfsession_namesessionr#   r#   r$   __init__(   s   

"

lz ReliableWebSearchEngine.__init__returnc                 C   s   t | jS )zGet a random user agent)randomchoicer`   )rd   r#   r#   r$   _get_random_user_agent   s   z.ReliableWebSearchEngine._get_random_user_agent   r   rB   r^   max_retriesc                 C   s&  g d}t |D ]}|D ]t}zU| j| }| }	|  |	d< ttdd |j|||	ddd}
|
j	dkrF| j
|  d	7  < |
W     S |
j	d
krVttdd W qtd|
j	 d|  W q ty } ztd| d|  W Y d}~qd}~ww ||d	 k rttd	d qdS )z,Make request with multiple session fallbacksr&   
User-Agent333333?      ?r*   T)rB   r^   r]   allow_redirects   rG   i        zHTTP  from zRequest failed with : Nrl   )rangerZ   copyrk   timesleepri   uniformgetstatus_coderc   loggerwarning	Exception)rd   r   rB   r^   rm   session_orderattemptre   rf   current_headersresponseer#   r#   r$   _make_request_with_fallback   s@   


z3ReliableWebSearchEngine._make_request_with_fallbackNbase_urlc                    s   |sdS | dr|rt|}|j d|j | S |S | dr:|dd dd }ztj|}W n   Y d|v r@dS g d	}zt| t fd
d|D rWW dS W n   Y | dsfd| }|S )zClean and normalize URLsr   /z://z/url?q=rG   &r   zbing.com/ck/a)z
google.comzbing.comzduckduckgo.comz
yandex.comc                 3   s    | ]}| j v V  qd S N)netloc).0r   
parsed_urlr#   r$   	<genexpr>  s    z5ReliableWebSearchEngine._clean_url.<locals>.<genexpr>)zhttp://https://r   )	
startswithr	   schemer   splitr(   utilsunquoteany)rd   r   r   parsed_baseskip_domainsr#   r   r$   
_clean_url   s4   


z"ReliableWebSearchEngine._clean_urlc                 C   s8   |sdS |j dd}tdd|}tdd|}| S )z5Extract clean text content from BeautifulSoup elementr   T)stripz\s+ z[\r\n\t])get_textresubr   )rd   elementrP   r#   r#   r$   _extract_text_content  s   z-ReliableWebSearchEngine._extract_text_contentqueryr   r   c           
      C   s   |r|sdS t |  }t |  }|r t |  nt  }|r/t||@ t| nd}|r=t||@ t| nd}|d |d  }	| | v rT|	d7 }	n| | v r`|	d7 }	t|	dS )z+Calculate relevance score for search resultr   r   ffffff?ro   g?rp   )setlowerr   lenmin)
rd   r   r   r   query_wordstitle_wordssnippet_wordstitle_overlapsnippet_overlap	relevancer#   r#   r$   _calculate_relevance  s   

z,ReliableWebSearchEngine._calculate_relevance
   engine_namenum_resultsc                 C   s  || j vrg S | j | }g }|d D ]b}z=|  dddddddd}|d	kr2|d
ddd n|dkr:d|d< |d  }d|v rI||d< nd|v rQ||d< | |||}	|	s\W qt|	jd}
g }|d D ]}|
|}|ru|} nqht	|
  dt| d |d| D ]}zd}|d D ]}||}|r| |}|rt|dkr nq|sW qd}|d D ]}||}|r|dd}| ||}|r|dr nq|sW qd}|d D ]}||}|r| |}|r nq| |||}t|dd  ||r
|dd! nd||t|j |d"}|| W q ty; } ztd#| d$|  W Y d}~qd}~ww t	|
  d%t| d& |rRW  |S W q tyu } zt|
  d'| d(|  W Y d}~qd}~ww |S ))z9Scrape a specific search engine with enhanced reliabilityrA   r+   r,   r-   r.   r/   r0   )rn   r1   r2   r3   r4   r5   r6   r7   rT   documentnavigatenone)zSec-Fetch-DestzSec-Fetch-ModezSec-Fetch-SiterU   zhttps://www.bing.com/RefererrB   r:   rP   zhtml.parserrC   z: Found z result elementsNr   rD   rl   rE   hrefhttprF   rr   i,  )r   r   r   r   r   r   r   zError parsing z	 result: z: Successfully extracted  resultsz scraping error with rv   )ra   rk   r_   rx   r   r   contentselectr~   info
capitalizer   
select_oner   r|   r   r   r   r   r	   r   r   appendr   r   error)rd   r   r   r   engine_configresultsr   r^   rB   r   soupresult_elementsselectorelementsr   r   title_selector
title_elemr   link_selector	link_elemr   snippet_selectorsnippet_elemr   resultr   r#   r#   r$   _scrape_search_engine2  s   











z-ReliableWebSearchEngine._scrape_search_enginert   c                    s  r  sg S   g  g d}|D ]J}z*|d }|r? | t|  dt| d t kr?W  n W q ty^ } zt|  d|  W Y d}~qd}~ww t k rtd t	ddh fd	d
|D }fdd|dd D }t
|D ]B}	||	 }
z|	jdd}|r | t|
  dt| d W q ty } zt|
  d|  W Y d}~qd}~ww W d   n1 sw   Y   std g S  }t|dd dd}tdt|  |d S )z1Search multiple engines with enhanced reliabilityrS   rs   z: Got r   z search failed: Nz0Attempting parallel search for better results...)max_workersc                    s"   g | ]}|d d  D vr|qS )c                 S   s   g | ]}|j qS r#   )r   )r   rr#   r#   r$   
<listcomp>  s    zNReliableWebSearchEngine.search_multiple_engines.<locals>.<listcomp>.<listcomp>r#   )r   r   )all_resultsr#   r$   r     s   " zCReliableWebSearchEngine.search_multiple_engines.<locals>.<listcomp>c                    s    i | ]}  j||qS r#   )submitr   )r   engine)executorr   r   rd   r#   r$   
<dictcomp>  s    zCReliableWebSearchEngine.search_multiple_engines.<locals>.<dictcomp>   )r]   z additional resultsz parallel search failed: z!No results from any search enginec                 S      | j S r   r   xr#   r#   r$   <lambda>      zAReliableWebSearchEngine.search_multiple_engines.<locals>.<lambda>TkeyreversezTotal unique results: )r   r   extendr~   r   r   r   r   r   r
   r   r   r   _deduplicate_resultssorted)rd   r   r   engines_to_user   r   r   remaining_enginesfuture_to_enginefuturer   unique_resultsranked_resultsr#   )r   r   r   r   rd   r$   search_multiple_engines  s^   



$

z/ReliableWebSearchEngine.search_multiple_enginesr   c                 C   s   |sg S g }t  }t  }t|dd dd}|D ]Q}|j|v rqtdd|j  }t | }d}	|D ] }
t |
 }|rU|rUt	||@ t	||B  }|dkrUd}	 nq5|	sh|
|j |
| || q|S )	z:Remove duplicate results based on URL and title similarityc                 S   r   r   r   r   r#   r#   r$   r     r   z>ReliableWebSearchEngine._deduplicate_results.<locals>.<lambda>Tr   z[^\w\s]r   Fr   )r   r   r   r   r   r   r   r   r   r   addr   )rd   r   r   	seen_urlsseen_titlessorted_resultsr   title_normalizedr   is_duplicate
seen_title
seen_wordsoverlapr#   r#   r$   r     s4   


z,ReliableWebSearchEngine._deduplicate_results)rl   r   )r   rt   )r   r   r   r   rg   r    rk   dictrb   r   r(   Responser   r   r   r"   r   r   r   r   r   r   r#   r#   r#   r$   r%   %   s     &-'  <r%   rt   r   r   rh   c                 C   sl  | r|   s	dgS ztt|dd}t| |}|sdgW S g }t|dD ]j\}}ddddd	}||jd
|j  d}|j	}t
|dkrO|dd d }dtdtdt|jd  }	|j}
t
|
dkro|
dd d }
| d|j d|
 d|j d| d|	 d|jdd}|| q%|W S  ty } ztd|  dt| dgW  Y d}~S d}~ww )zN
    Perform highly reliable web search with multiple fallback strategies
    z$Please provide a valid search query.rG   r*   z<No relevant results found. Please try rephrasing your query.u   [GOOGLE] 🔍u   [BING] 🔍u   [DUCKDUCKGO] 🦆u   [YANDEX] 🔍rS   []r   N   z...   ⭐rt   rr      r   
u   
🔗 URL:    
🌐 Domain:    
📊 Relevance:  (.2f)zWeb search error: zSearch encountered an error: z. Please try again.)r   r   maxsearch_enginer   	enumerater|   r   upperr   r   rb   r   r   r   r   r   r   r~   r   r    )r   r   r   formatted_resultsir   source_badgessource_badgedomain_namerelevance_starsr   formatted_resultr   r#   r#   r$   perform_web_search  sR   r  search_resultsc                 C   sh  |sdS t dd |D rd|  dS ddddd}d}d}|D ]S}|d	}|rs|d }| D ]}d
| d|v rE||  d7  <  nq1|D ]*}	d|	v rrd|	v rrz|	dd dd }
t|
}||7 }|d7 }W qH   Y qHqHq |dkr||| }ddd | D }|dkrd}n|dkrd}n	|dkrd}nd}d|  dt| d| d| d|dd S )!zE
    Generate intelligent search summary with enhanced analytics
    zNo search results generated.c                 s   s    | ]	}d |  v V  qdS )r   N)r   )r   r   r#   r#   r$   r   Y  s    z%get_search_summary.<locals>.<genexpr>z&Search encountered issues for query: ''r   )GOOGLEBING
DUCKDUCKGOYANDEXr   r   r   r   rG   z
Relevance:(r   z, c                 S   s&   g | ]\}}|d kr| d| qS )r   ru   r#   )r   r   rH   r#   r#   r$   r   z  s   & z&get_search_summary.<locals>.<listcomp>g?	Excellentg333333?Goodg?FairBasicu&   🔍 Enhanced Web Search Results for 'u   ':
📊 Found u$    high-quality results
🌐 Sources: u   
⭐ Average Relevance: r   r   ug   /1.00)
🚀 Multi-engine scraping with reliability fallbacks
🔗 All results verified with direct URLs)r   r   keysr"   joinr[   r   )r   r  source_countsavg_relevancetotal_relevance_scoresr   lines
first_liner   line
score_textscoresource_summaryquality_ratingr#   r#   r$   get_search_summaryR  s\   
r#  c                 C   s   | r|   sg S zTt| |}|sg W S g }|D ]A}d|j  d|j d|j d|j ddtdt	dt
|jd   d	|jd
d}||j|j|j|j|j|jd}|| q|W S  tyx } ztd|  g W  Y d}~S d}~ww )zA
    Get structured search results with enhanced reliability
    r   u   ] 🔍 r   r   r   r   rt   rG   r   r   r   )r   r   r   r   r   r   r   zStructured search error: N)r   r  r   r   r  r   r   r   r   r   rb   r   r   r   r   r~   r   )r   r   r   structured_resultsr   r   structured_resultr   r#   r#   r$   get_structured_search_results  sB   	r&  r   )/r(   jsonloggingr   ry   typingr   r   r   r   r   urllib.parser   r   r	   hashlibconcurrent.futuresr
   r   ri   dataclassesr   collectionsr   	threading	functoolsr   bs4r   urllib3fake_useragentr   r'   disable_warnings
exceptionsInsecureRequestWarning	getLoggerr   r~   r   r%   r  r    rb   r  r#  r&  r#   r#   r#   r$   <module>   s<    
   s;(=