o
    ?iQ                  	   @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
mZmZmZmZ ddlmZ ddlZddlmZ ddlZddlmZmZ ejejdd	 eeZe	d
Zejdd e Z g dZ!h dZ"g dg dg dg dg dg ddZ#de$dee$e%f fddZ&de$de$de	fddZ'de(de$fdd Z)G d!d" d"Z*de$d#e$dee$ fd$d%Z+ed&kre,d' e,d( e,d' g d)Z-e Z.dZ/e0e-d*D ]e\Z1\Z2Z3e,d+d'  e,d,e1 d-e4e- d.e2  e,d/e3 d0 e,d'  e Z5e+e2e3Z6e e5 Z7e6r(e,d1e7d2d3 e,d4e6  e/d*7 Z/n	e,d5e7d2d3 e1e4e-k r=e8d* qe e. Z9e,d+d'  e,d6 e,d7e4e- d8 e,d9e/ d-e4e- d:e/e4e- d; d<d= e,d>e9d2d3 e,d?e9e4e- d2d@ e,d' dS dS )Aa  
================================================================================
 Storigo v10.0 - Ultra-Fast Accurate Bing Image Generator
 - Single-source Bing scraping with advanced accuracy
 - Multi-strategy semantic search with intelligent fallbacks
 - Enhanced copyright-free filtering
 - Parallel candidate validation for 3x speed improvement
 - No caching, no external APIs - Pure Bing excellence
================================================================================
    N)Path)OptionalDictListTuple)	urlencode)BeautifulSoup)ThreadPoolExecutoras_completedz?%(asctime)s - [%(levelname)s] - (StorigoImageGen) - %(message)s)levelformatgenerated_images_for_storigoT)exist_ok)zoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36zuMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36zPMozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0zeMozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36>0   aanasatbebyifinisofonortoandanyarebutcanforhadhasmaythewasbeenfromhavemanymoremostmuchsomethatthenthiswerewillwithbeingcouldmightshallthesethosewouldhavingshouldwithout)teamstaffpeoplepersonworkeremployeeprofessionalgroupcrowd
colleaguesmembers)	officemeetingbusiness	corporate	workspacedesk
conference	boardroom	executive)computerlaptopscreenmonitordata	analytics	dashboardchartgraphdigital
technology)	kitchen
restaurantchefcookingfooddiningmealculinarypreparation)	buildinginteriorexteriorroomhallspaceareafacilityvenue)working
discussing
presentingcollaborating	analyzingcreatingplanning
organizing)rA   rL   techrb   placesactionspromptreturnc                 C   sH  |    }| }td|}tdd|}tdd|}dd | D }g }g }i }|D ]+}	d}
t D ]\}}|	|v rR||	 |	|d	d
 ||< d}
 nq7|
sZ||	 q/g }t
t|d
 D ] }|| tvr||d
  tvr|||  d||d
    qeg }|r|d|d	 df |r|d	 }|r| d|d	  }|d|df |d|df |rt|dkr|d|d	  d|d
  df |r|d|d	  d|d	  df |d|d	 df |rd|dd }|d|df |s|dd|dd df ||dd  |dd  |dd |d!S )"zz
    Extract semantically meaningful keywords with intelligent prioritization
    Returns optimized search strategies
    z	"([^"]+)"z"[^"]+" z[^a-zA-Z0-9\s-] c                 S   s$   g | ]}t |d kr|tvr|qS )   )len	STOPWORDS).0t r   E/var/www/eduai.edurigo.com/storigo/testing/storigo_image_generator.py
<listcomp>]   s   $ z-extract_semantic_keywords.<locals>.<listcomp>Fr      Texact_quoted   phrase_enhanced_   phrase_pureZ   r   primary_duoU   primary_contextP   primary_singleK   N   full_contextF   original   <      )
strategiesprimarycontextphrases
categories)lowerstripsplitrefindallsubPRIORITY_PATTERNSitemsappendgetranger   r   join)r{   textoriginal_tokensquoted
text_cleantokensr   r   category_matchestokenmatchedcategorykeywordsr   ir   best_phraseenhanced_phraser   r   r   r   extract_semantic_keywordsO   sh   

 ""


r   	slide_keyqueryc                 C   sn   t dd| }d| dd }|sd}|dd }tt d d	 }|  d| d| d
}t| S )zGenerate clean, unique filenamez[^a-zA-Z0-9\s]r}   _Nr   image   i  i@B .jpg)r   r   r   r   r   inttime
IMAGES_DIR)r   r   clean_query	timestampfilenamer   r   r   generate_filename   s   r   
image_datac                 C   s@   |  drdS |  drdS t| dkr| dd dkrdS dS )	z#Determine MIME type from image data   z
image/jpeg   PNG

z	image/png         WEBPz
image/webp)
startswithr   )r   r   r   r   get_mime_type   s   

r   c                
   @   s   e Zd ZdZdd Zdededee fddZd	edefd
dZ	d	ede
eeef  fddZdedefddZddee dede
eeef  fddZdedede
e fddZdS )BingImageScraperz<High-performance Bing image scraper with parallel processingc              
   C   s:   t  | _| jjttddddddd t | _	d S )NzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.9zgzip, deflate, br1z
keep-alive)
User-AgentAcceptzAccept-LanguagezAccept-EncodingDNT
ConnectionzUpgrade-Insecure-Requests)
requestsSessionsessionheadersupdaterandomchoiceUSER_AGENTSset_tried_urls)selfr   r   r   __init__   s   
	zBingImageScraper.__init__r   strategy_namer|   c                 C   s  z|dddddd}dt | }tt| jjd< d	| jjd
< | jj|dd}|jdkr9t	d|j  g W S t
|jd}g }|dddi}|dd D ]8}	z)|	d}
|
ryt|
}|dpg|d}|ry| |ry|| jvry|| W qO tjttfy   Y qOw t|dk r|dddi}|dd D ]}|dp|d}|r| |r|| jvr|| qt	d| dt| d| d  |dd! W S  tjy   t	d"|  g  Y S  ty
 } zt	d#| d$t|dd%   g W  Y d}~S d}~ww )&zo
        Search Bing images with copyright-free filter
        Returns list of high-quality image URLs
        z#+filterui:license-L2_L3_L4_L5_L6_L7IRFLTRr   50Moderate)qqftformfirstcount
safeSearchz#https://www.bing.com/images/search?r   zhttps://www.bing.com/Referer
   timeout   u     ⚠️ Bing returned status zhtml.parserr   classiuscN#   mmurlturlimgmimg   srczdata-srcu     → z: Found z URLs for ''   u     ⚠️ Timeout for u     ⚠️ Error in : r   )r   r   r   r   r   r   r   status_codeloggerdebugr   r   find_alljsonloads_is_valid_image_urlr   r   JSONDecodeErrorAttributeErrorKeyErrorr   r   Timeout	Exceptionstr)r   r   r   params
search_urlresponsesoup
image_urlsimage_containers	containerm_attrrW   img_urlimg_tagsr   r   er   r   r   _search_bing   s`   	





""zBingImageScraper._search_bingurlc                    s`   |r| ds	dS g d}|  t fdd|D }g d}t fdd|D }|o/| S )z'Validate image URL format and extensionhttpF)r   z.jpegz.pngz.webpc                 3       | ]}| v V  qd S Nr   )r   ext	url_lowerr   r   	<genexpr>      z7BingImageScraper._is_valid_image_url.<locals>.<genexpr>)faviconlogoiconavatar	thumbnailpixelc                 3   r  r  r   )r   termr  r   r   r    r  )r   r   r   )r   r  
valid_extshas_extensionblocked
is_blockedr   r  r   r  
  s   
z$BingImageScraper._is_valid_image_urlc              	   C   s   || j v rdS | j | z2| jj|dddd}|jdkr W dS |jdd }d|vr0W dS |j}| |s;W dS ||fW S  t	j
t	jfyM   Y dS  tyV   Y dS w )	z_
        Download image and validate quality
        Returns (image_data, url) or None
        Nr   T)r   allow_redirectsstreamr   zcontent-typer}   r   )r   addr   r   r   r   r   content_validate_qualityr   r  RequestExceptionr  )r   r  r  content_typerW   r   r   r   _download_and_validate  s0   



z'BingImageScraper._download_and_validaterW   c                 C   sp   |rt |dk r
dS t |dkrdS |dd dk}|dd dk}t |d	kr0|dd	 d
knd}|p7|p7|S )z&Validate image data quality and formati:  Fi Nr   r   r   r   r   r   )r   )r   rW   jpegpngwebpr   r   r   r.  D  s    z"BingImageScraper._validate_qualityr   urlsmax_workersc           
   
      s   t |dg  fdd|dd D }t|ddD ]I}z>| }|rY|\}}t| }|tvrLt| |D ]}	|	  q9|W   W  d   S t	
d|dd	  d
 W q tyc   Y qw W d   dS 1 sow   Y  dS )z
        Download multiple URLs in parallel for speed
        Returns first successful download that is not a duplicate
        )r6  c                    s   i | ]
}  j||qS r   )submitr1  )r   r  executorr   r   r   
<dictcomp>Y  s    z7BingImageScraper._parallel_download.<locals>.<dictcomp>N   r   r   u3     → Parallel download got duplicate image (hash: r   z), continuing...)r	   r
   resulthashlibmd5	hexdigestUSED_IMAGE_HASHESr,  cancelr   r   r  )
r   r5  r6  futuresfuturer<  r   
source_url
image_hashfr   r8  r   _parallel_downloadS  s4   



z#BingImageScraper._parallel_downloadr   r{   c              
   C   s  zft |}|d }td| dt| d td|d   g }|dd D ];\}}}td	| d
| d| d | ||}	|	rS|	D ]
}
||
||f qHtdd |D dkr` ntd q*|stt	d| d W dS |j
dd dd g }t }|D ]\}
}}|
|vr||
 ||
 qtdt| d td | |}|r|\}}t|}t|d}d| d| }t|}td| d td|d d!|d" d#d$ td% td&|  |W S td' |dd( D ]\}
| |
}|rV|\}}t| }|tv r'td)|dd*  d+ qt| t|}t|d}d| d| }t|}td| d,|d d- |  W S td. qt	d| d/ W dS  ty } ztd0| d1t|  W Y d}~dS d}~ww )2z
        Main entry point: Fetch accurate image from Bing

        Args:
            slide_key: Slide identifier
            prompt: Visualization description

        Returns:
            Image URL path or None
        r   u   🔍 [z] Searching Bing with z strategiesz   Primary keywords: r   Nr   u     → Strategy 'z': 'z' (priority: )c                 S   s   g | ]
}|d  dkr|qS )r   r   r   )r   cr   r   r   r     s    z0BingImageScraper.fetch_image.<locals>.<listcomp>r   g333333?u   ⚠️ [z] No image candidates foundc                 S   s   | d S )Nr   r   )xr   r   r   <lambda>  s    z.BingImageScraper.fetch_image.<locals>.<lambda>T)keyreversez	   Found z unique candidatesu%     ⚡ Attempting parallel download...zutf-8zdata:z;base64,u   ✅ [z,] Image downloaded and encoded successfully!z	   Size: ,z bytes (i   .1fz KB)z    Source: Bing (copyright-free)z   Format: u#     → Trying sequential download...r   u5     → Sequential download got duplicate image (hash: r   z), skipping...z ] Image downloaded and encoded: z bytesg?z(] No valid images found after validationu   ❌ [z	] Error: )r   r   infor   r   r  r   r   sleepwarningsortr   r,  rG  r   base64	b64encodedecoder1  r=  r>  r?  r@  r  errorr	  )r   r   r{   keywords_datar   all_candidatesr   r   priorityr5  r  unique_urlsseenstrategyr<  r   rD  	mime_typebase64_datadata_uri	file_sizerE  r  r   r   r   fetch_imagep  s   



 





zBingImageScraper.fetch_imageN)r   )__name__
__module____qualname____doc__r   r	  r   r  boolr  r   r   bytesr1  r.  r   rG  rb  r   r   r   r   r      s    B((r   visualization_promptc                 C   s   t  }|| |S )a  
    PUBLIC API: Fetch accurate copyright-free image from Bing and return as base64 data URI

    Args:
        slide_key: Slide identifier (e.g., "slide_1")
        visualization_prompt: The visualization suggestion text

    Returns:
        Base64 data URI (e.g., "...") or None

    Example:
        >>> data_uri = fetch_image_for_slide("slide_1", "kitchen staff in commercial kitchen")
        >>> print(data_uri)  # "..."
    )r   rb  )r   ri  scraperr   r   r   fetch_image_for_slide  s   rk  __main__zP================================================================================u:   🧪 Testing Storigo Ultra-Fast Bing Image Generator v10.0))slide_1z+kitchen staff working in commercial kitchen)slide_2z+data analytics dashboard with charts graphs)slide_3z,business team collaboration in modern office)slide_4z.professional chef preparing food in restaurant)slide_5z1corporate meeting with people discussing strategyr   
u
   📸 Test /r   z   Prompt: 'r   u      ✅ SUCCESS in z.2fsz   URL: u      ❌ FAILED after u   📊 Test Results:z
   Total: z testsz   Success: z (r   rO  z%)z   Total time: z   Average: zs per image):rf  r   r=  r   loggingr   osrT  pathlibr   typingr   r   r   r   urllib.parser   r  bs4r   r   concurrent.futuresr	   r
   basicConfigINFO	getLoggerrc  r   r   mkdirr   r@  r   r   r   r	  r   r   r   rh  r   r   rk  print
test_casestotal_startsuccess_count	enumerateidxr   r{   r   
start_timer  elapsedrQ  
total_timer   r   r   r   <module>   s    


N  '




.