o
    ڜ,hB8                     @   sz  d Z ddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZ ddlZddlmZmZmZ ddlmZmZmZmZ ddlmZ ddlmZ dd	l m!Z!m"Z"m#Z#m$Z$ dd
l%m&Z&m'Z'm(Z( ddl)m*Z* e+  e,e-Z.e/dZ0dee1e2f de1fddZ3d3dede4dee1 ddfddZ5	d4dedede6ddfddZ7		d5dededee1 de6ddf
ddZ8				d6dede1d e	d!e
e9 d"e1d#e6dee1 d$ee1 ddfd%d&Z:e&d'Z;e&d(Z<de1d"e1d$e1ddfd)d*Z=					d7dede1d ee9 d!e
e9 d"e1d#e6dee1 d$ee1 de6ddfd+d,Z>defd-d.Z?d3d/eee1  ddfd0d1Z@e-d2kr;e@  dS dS )8z#Extract pdf structure in XML format    N)ArgumentParser)	Any	ContainerDictIterableListOptionalTextIOUnioncast)PDFDocumentPDFNoOutlinesPDFXRefFallback)
PDFIOErrorPDFObjectNotFoundPDFTypeErrorPDFValueError)PDFPage)	PDFParser)	PDFObjRef	PDFStreamresolve1stream_value)LIT	PSKeyword	PSLiteral)isnumberz&[\000-\037&<>()"\042\047\134\177-\377]sreturnc                 C   s*   t | trt| d}n| }tdd |S )Nzlatin-1c                 S   s   dt | d S )Nz&#%d;r   )ordgroup)m r"   /var/www/eduai.edurigo.com/doc_train/edurigo_ai/Puru/ai4bharat/ai4bharat_env/lib/python3.10/site-packages/../../../bin/dumppdf.py<lambda>$   s    zescape.<locals>.<lambda>)
isinstancebytesstrESC_PATsub)r   usr"   r"   r#   escape   s   
r+   outobjcodecc                 C   s  |d u r|  d d S t|tr?|  dt|  | D ]\}}|  d|  |  d t| | |  d q|  d d S t|trc|  dt|  |D ]}t| | |  d qO|  d	 d S t|ttfry|  d
t|t	|f  d S t|t
r|dkr|  |  d S |dkr|  |  d S |  d t| |j |  d |dkr| }|  dt|t	|f  |  d d S t|tr|  d|j  d S t|tr|  d|j  d S t|tr|  d|j  d S t|r|  d|  d S t|)Nz<null />z<dict size="%d">
z<key>%s</key>
z<value>z	</value>
z</dict>z<list size="%d">

z</list>z<string size="%d">%s</string>rawbinaryz<stream>
<props>
z

</props>
textz<data size="%d">%s</data>
z	</stream>z<ref id="%d" />z<keyword>%s</keyword>z<literal>%s</literal>z<number>%s</number>)writer%   dictlenitemsdumpxmllistr'   r&   r+   r   get_rawdataget_dataattrsr   objidr   namer   r   r   )r,   r-   r.   kvdatar"   r"   r#   r7   '   sd   








	





r7   Fdocshow_fallback_xrefc                 C   sr   |j D ]}t|tr|r| d t| |  | d qtdd |j D }|r5|s7d}t| d S d S d S )Nz
<trailer>
z
</trailer>

c                 s   s    | ]}t |tV  qd S N)r%   r   ).0xrefr"   r"   r#   	<genexpr>r   s    zdumptrailers.<locals>.<genexpr>zThis PDF does not have an xref. Use --show-fallback-xref if you want to display the content of a fallback xref that contains all objects.)	xrefsr%   r   r3   r7   get_trailerallloggerwarning)r,   rA   rB   rE   no_xrefsmsgr"   r"   r#   dumptrailersh   s   


rN   c           	      C   s   t  }| d |jD ]K}| D ]D}||v rq|| z ||}|d u r)W q| d|  t| ||d | d W q tyU } ztd|  W Y d }~qd }~ww qt	| || | d d S )Nz<pdf>z<object id="%d">
r.   z
</object>

znot found: %rz</pdf>)
setr3   rG   
get_objidsaddgetobjr7   r   printrN   )	r,   rA   r.   rB   visitedrE   r<   r-   er"   r"   r#   dumpallobjs|   s*   



rW    outfpfnameobjidspagenospassworddumpall
extractdirc                    s~  t |d}t|}	t|	| dd tt dD }
dtdtf fdd}z  }| 	d	 |D ]r\}}}}}d }|rI||}|
|d
 j
 }n(|rq|}t|trq|d}|rqt|dkrq|drq||d }|
|d
 j
 }t|}| 	d|d| d |d ur| 	d t| | | 	d |d ur| 	d|  | 	d q2| 	d W n	 ty   Y nw |	  |  d S )Nrbc                 S   s   i | ]\}}|j |qS r"   )pageid)rD   pagenopager"   r"   r#   
<dictcomp>   s    zdumpoutline.<locals>.<dictcomp>   destr   c                    s`   t | ttfrt | } nt | trt | j} t | tr%| d } t | tr.| 	 } | S )ND)
r%   r'   r&   r   get_destr   r=   r4   r   resolve)rf   rA   r"   r#   resolve_dest   s   


z!dumpoutline.<locals>.resolve_destz<outlines>
r   Sz/'GoTo'rg   z<outline level="z	" title="z">
z<dest>z</dest>
z<pageno>%r</pageno>
z</outline>
z</outlines>
)openr   r   	enumerater   create_pagesobjectr   get_outlinesr3   r<   r%   r4   getreprr+   r7   r   close)rY   rZ   r[   r\   r]   r^   r.   r_   fpparserpagesrk   outlinesleveltitlerf   aserb   actionsubtyper   r"   rj   r#   dumpoutline   sJ   








r   FilespecEmbeddedFilec           
         s   dt dtttf dd f fdd}t| dA}t|}t|| t } jD ](}|	 D ]!} 
|}	||vrNt|	trN|	dtu rN|| |||	 q-q'W d    d S 1 s[w   Y  d S )Nr<   r-   r   c                    s   t j|dptt|d }|d dp |d d} |j}t	|t
s4d| }t||dturAtd| t jd| |f }t j|rXtd| td	|  t jt j|d
d t|d}||  |  d S )NUFFEFz:unable to process PDF: reference for %r is not a PDFStreamTypez>unable to process PDF: reference for %r is not an EmbeddedFilez%.6d-%szfile exists: %rzextracting: %rT)exist_okwb)ospathbasenamerr   r   r&   decoderS   r<   r%   r   r   LITERAL_EMBEDDEDFILEjoinexistsr   rT   makedirsdirnamerm   r3   r:   rt   )r<   r-   filenamefilereffileobj	error_msgr   r,   rA   r_   r"   r#   extract1   s.   &

z!extractembedded.<locals>.extract1r`   r   )intr   r'   r   rm   r   r   rP   rG   rQ   rS   r%   r4   rr   LITERAL_FILESPECrR   )
rZ   r]   r_   r   ru   rv   extracted_objidsrE   r<   r-   r"   r   r#   extractembedded   s$   $




"r   c	                 C   s   t |d}	t|	}
t|
|}|r!|D ]}||}t| ||d q|rMtt|D ]"\}}||v rL|rF|jD ]}t	|}t| ||d q7q*t| |j
 q*|rVt| ||| |sb|sb|sbt| || |	  |dvrq| d d S d S )Nr`   rO   )r0   r1   r/   )rm   r   r   rS   r7   rn   r   ro   contentsr   r;   rW   rN   rt   r3   )rY   rZ   r[   r\   r]   r^   r.   r_   rB   ru   rv   rA   r<   r-   rb   rc   r"   r"   r#   dumppdf   s2   



r   c                  C   sd  t tdd} | jdtd ddd | jddd	d
tj d | jdddddd |  }|jdddddd |jddtdd | jddd}|jdtd ddd |jddtd d |jd!d"td#d |jd$d%ddd&d |jd'dd(d) |jd*d+td,d-d. | jd/d0d}|jd1d2td3d4d. | }|jd5d6ddd7d |jd8d9ddd:d |jd;d<ddd=d | S )>NT)descriptionadd_helpfiles+zOne or more paths to PDF files.)typedefaultnargshelpz	--versionz-vversionzpdfminer.six v)r}   r   z--debugz-dF
store_truezUse debug logging level.)r   r}   r   z--extract-tocz-TzExtract structure of outlinez--extract-embeddedz-EzExtract embedded files)r   r   ParserzUsed during PDF parsing)r   z--page-numbersz0A space-seperated list of page numbers to parse.z	--pagenosz-pzA comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.z	--objectsz-iz1Comma separated list of object numbers to extractz--allz-az3If the structure of all objects should be extractedz--show-fallback-xrefzAdditionally show the fallback xref. Use this if the PDF has zero or only invalid xref's. This setting is ignored if --extract-toc or --extract-embedded is used.)r}   r   z
--passwordz-PrX   z,The password to use for decrypting PDF file.)r   r   r   OutputzUsed during output generation.z	--outfilez-o-zJPath to file where output is written. Or "-" (default) to write to stdout.z--raw-streamz-rz%Write stream objects without encodingz--binary-streamz-bz)Write stream objects with binary encodingz--text-streamz-tz"Write stream objects as plain text)	r   __doc__add_argumentr'   pdfminer__version__add_mutually_exclusive_groupadd_argument_groupr   )rv   procedure_parserparse_paramsoutput_paramscodec_parserr"   r"   r#   create_parser#  s   
r   argvc           	      C   sB  t  }|j| d}|jrt tj |jdkrtj	}nt
|jd}|jr2dd |jdD }ng }|jr@dd |jD }n|jrOd	d |jdD }nt }|j}|jr[d
}n|jrad}n|jrgd}nd }|jD ].}|jrt||||||j|d d ql|jrt|||jd qlt||||||j|d |jd	 ql|  d S )N)argsr   wc                 S   s   g | ]}t |qS r"   r   rD   xr"   r"   r#   
<listcomp>      zmain.<locals>.<listcomp>,c                 S   s   h | ]}|d  qS re   r"   r   r"   r"   r#   	<setcomp>  r   zmain.<locals>.<setcomp>c                 S   s   h | ]}t |d  qS r   r   r   r"   r"   r#   r     s    r0   r1   r2   )r]   r^   r.   r_   )r]   r_   )r]   r^   r.   r_   rB   )r   
parse_argsdebuglogging	getLoggersetLevelDEBUGoutfilesysstdoutrm   objectssplitpage_numbersr\   rP   r]   
raw_streambinary_streamtext_streamr   extract_tocr   rI   extract_embeddedr   r   rB   rt   )	r   rv   r   rY   r[   r\   r]   r.   rZ   r"   r"   r#   main  sb   


r   __main__rC   )F)NF)rX   FNN)rX   FNNF)Ar   r   os.pathr   rer   argparser   typingr   r   r   r   r   r   r	   r
   r   r   pdfminer.pdfdocumentr   r   r   pdfminer.pdfexceptionsr   r   r   r   pdfminer.pdfpager   pdfminer.pdfparserr   pdfminer.pdftypesr   r   r   r   pdfminer.psparserr   r   r   pdfminer.utilsr   basicConfigr   __name__rJ   compiler(   r'   r&   r+   rp   r7   boolrN   rW   r   r   r   r   r   r   r   r   r"   r"   r"   r#   <module>   s   ,

 D

	
<.	

${
A
