c,ddlmZddlmZddlmZddlmZmZm Z m Z m Z m Z m Z ddlmZddlmZmZmZGdd ZGd d Ze eefZe eZGd d ZdS))aliases)sha256)dumps)AnyDictIteratorListOptionalTupleUnion)TOO_BIG_SEQUENCE) iana_nameis_multi_byte_encoding unicode_rangeceZdZ d%dededededddeef d Zd e d efd Z d e d efd Z e d efdZ d efdZd efdZd&dZe d efdZe d eefdZe d efdZe d efdZe d eefdZe d efdZe d efdZe d efdZe d efdZe d efdZe d efdZe d edfdZe d efdZe d eefdZe d eefd Z d'd"ed efd#Z!e d efd$Z"dS)( CharsetMatchNpayloadguessed_encodingmean_mess_ratiohas_sig_or_bom languagesCoherenceMatchesdecoded_payloadc||_||_||_||_||_d|_g|_d|_d|_d|_ ||_ dS)N) _payload _encoding_mean_mess_ratio _languages_has_sig_or_bom_unicode_ranges_leaves_mean_coherence_ratio_output_payload_output_encoding_string)selfrrrrrrs ;/usr/lib/python3/dist-packages/charset_normalizer/models.py__init__zCharsetMatch.__init__ s[ ' .'6,5%348+- ,/"04/3&5 otherreturnct|tsGtdt |jt |j|j|jko|j|jkS)Nz&__eq__ cannot be invoked on {} and {}.) isinstancer TypeErrorformatstr __class__encoding fingerprintr(r,s r)__eq__zCharsetMatch.__eq__$ss%.. 8??((#dn*=*=  }.X43CuGX3XXr+cNt|tstt|j|jz }t|j|jz }|dkr<|dkr6|dkr |j|jkr|j|jkS|j|jkS|j|jkS)zQ Implemented to make sorted available upon CharsetMatches items. g{Gz?g{Gz?r)r/r ValueErrorabschaos coherencemulti_byte_usage)r(r,chaos_differencecoherence_differences r)__lt__zCharsetMatch.__lt__-s%..  "%dj5;&>"?"?&)$.5?*J&K&K d " "';d'B'B3&&4>U_+L+L,u/EEE>EO3 3zEK''r+cjdtt|t|jz z S)Ng?)lenr2rawr(s r)r=zCharsetMatch.multi_byte_usage@s&ST^^c$(mm333r+c^|j t|j|jd|_|jS)Nstrict)r'r2rrrDs r)__str__zCharsetMatch.__str__Ds) < t}dnhGGDL|r+cBd|j|jS)Nz)r1r4r5rDs r)__repr__zCharsetMatch.__repr__Js.55dmTEUVVVr+ct|tr||kr'td|jd|_|j|dS)Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r/rr9r1r3r'r#appendr6s r) add_submatchzCharsetMatch.add_submatchMsk%.. %4--MTTO    E"""""r+c|jSN)rrDs r)r4zCharsetMatch.encodingXs ~r+cg}tjD]F\}}|j|kr||&|j|kr||G|S)z Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855. )ritemsr4rK)r( also_known_asups r)encoding_aliaseszCharsetMatch.encoding_aliases\sn $& MOO ( (DAq}!!$$Q''''!##$$Q'''r+c|jSrNr!rDs r)bomzCharsetMatch.bomi ##r+c|jSrNrVrDs r)byte_order_markzCharsetMatch.byte_order_markmrXr+c$d|jDS)z Return the complete list of possible languages found in decoded sequence. Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'. cg|] }|d S)r).0es r) z*CharsetMatch.languages..ws...!...r+r rDs r)rzCharsetMatch.languagesqs /.do....r+c|jshd|jvrdSddlm}m}t |jr||jn||j}t|dksd|vrdS|dS|jddS)z Most probable language found in decoded sequence. If none were detected or inferred, the property will return "Unknown". asciiEnglishr)encoding_languagesmb_encoding_languagesz Latin BasedUnknown)r could_be_from_charsetcharset_normalizer.cdrerfrr4rB)r(rerfrs r)languagezCharsetMatch.languageys  $444 y X W W W W W W W*$-887%%dm444'' 66  9~~""my&@&@ yQ< q!!$$r+c|jSrN)rrDs r)r;zCharsetMatch.chaoss $$r+c:|jsdS|jddS)Nrrr rarDs r)r<zCharsetMatch.coherences# 3q!!$$r+c4t|jdzdSNd)ndigits)roundr;rDs r) percent_chaoszCharsetMatch.percent_chaossTZ#%q1111r+c4t|jdzdSrn)rrr<rDs r)percent_coherencezCharsetMatch.percent_coherencesT^c)15555r+c|jS)z+ Original untouched bytes. )rrDs r)rCzCharsetMatch.raws }r+c|jSrN)r#rDs r)submatchzCharsetMatch.submatchs |r+c2t|jdkSNr)rBr#rDs r) has_submatchzCharsetMatch.has_submatchs4<  1$$r+c|j|jSdt|D}ttd|D|_|jS)Nc,g|]}t|Sr])r)r^chars r)r`z*CharsetMatch.alphabets..s-0 0 0 $(M$  0 0 0 r+ch|]}||Sr]r])r^rs r) z)CharsetMatch.alphabets..s+L+L+L!!+LA+L+L+Lr+)r"r2sortedlist)r(detected_rangess r) alphabetszCharsetMatch.alphabetssj   +' '0 0 ,/II0 0 0  &d+L+L+L+L+L&M&MNN##r+c6|jgd|jDzS)z The complete list of encoding that output the exact SAME str result and therefore could be the originating encoding. This list does include the encoding available in property 'encoding'. cg|] }|j Sr])r4)r^ms r)r`z6CharsetMatch.could_be_from_charset..s"D"D"D!1:"D"D"Dr+)rr#rDs r)rhz"CharsetMatch.could_be_from_charsets%"D"Dt|"D"D"DDDr+utf_8r4c|j |j|kr/||_t||d|_|jS)z Method to get re-encoded bytes payload using given target encoding. Default to UTF-8. Any errors will be simply ignored by the encoder NOT replaced. Nreplace)r&r2encoder%)r(r4s r)outputzCharsetMatch.outputsI  (D,AX,M,M$,D !#&t99#3#3Hi#H#HD ##r+cht|S)zw Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one. )rr hexdigestrDs r)r5zCharsetMatch.fingerprints& dkkmm$$..000r+rN)r,rr-N)r)#__name__ __module__ __qualname__bytesr2floatboolr r*objectr7r@propertyr=rGrIrLr4r rTrWrZrrjr;r<rsrurCrxr{rrhrr5r]r+r)rr s*.6666 6  6 & 6"#66662YFYtYYYY(F(t((((&4%444X4 W#WWWW # # # ##X $s)   X $T$$$X$$$$$X$/49///X/%#%%%X%6%u%%%X%%5%%%X% 2u222X265666X6UX $~.X%d%%%X% $49 $ $ $X $EtCyEEEXE $ $s $ $ $ $ $1S111X111r+rceZdZdZddeeefdZdeefdZ de e e fdefdZ de fd Zdefd Zdeddfd Zded fd Zded fdZdS)CharsetMatchesz Container with every CharsetMatch items ordered by default from most probable to the less one. Act like a list(iterable) but does not implements all related methods. Nresultsc6|rt|ng|_dSrN)r_results)r(rs r)r*zCharsetMatches.__init__s?F,NF7OOOB r+r-c#$K|jEd{VdSrNrrDs r)__iter__zCharsetMatches.__iter__s&=         r+itemct|tr |j|St|tr't |d}|jD]}||jvr|cSt )z Retrieve a single item either by its position or encoding name (alias may be used here). Raise KeyError upon invalid index or encoding not present in results. F)r/intrr2rrhKeyError)r(rresults r) __getitem__zCharsetMatches.__getitem__sv dC  '=& & dC  "T5))D- " "6777!MMM8r+c*t|jSrNrBrrDs r)__len__zCharsetMatches.__len__s4=!!!r+c2t|jdkSrzrrDs r)__bool__zCharsetMatches.__bool__s4=!!A%%r+ct|ts4tdt |jt |jtkrB|j D]:}|j |j kr(|j |j kr| |dS;|j |t|j |_ dS)z~ Insert a single match. Will be inserted accordingly to preserve sort. Can be inserted as a submatch. z-Cannot append instance '{}' to CharsetMatchesN)r/rr9r1r2r3rBrCrrr5r;rLrKr)r(rmatchs r)rKzCharsetMatches.appends $ -- ?FF''  tx==, , ,  $(888U[DJ=V=V&&t,,,FF T"""t}-- r+rc.|jsdS|jdS)zQ Simply return the first match. Strict equivalent to matches[0]. NrrrDs r)bestzCharsetMatches.bests } 4}Qr+c*|S)zP Redundant method, call the method best(). Kept for BC reasons. )rrDs r)firstzCharsetMatches.firstsyy{{r+rN)rrr__doc__r r rr*rrr rr2rrrrrKrrr]r+r)rrs0 OOl); <OOOO!(<0!!!! c3h L    """""&$&&&&.<.D....( h~.    x/r+rceZdZdedeedeedeededeededed ed eed efd Ze d e ee ffdZ d efdZ dS)CliDetectionResultpathr4rTalternative_encodingsrjrrr;r< unicode_path is_preferredc ||_| |_||_||_||_||_||_||_||_| |_ | |_ dSrN) rrr4rTrrjrrr;r<r) r(rr4rTrrjrrr;r<rrs r)r*zCliDetectionResult.__init__&s\ +7'/ +;0E"% $-$2! )".r+r-c |j|j|j|j|j|j|j|j|j|j |j d S)N rr4rTrrjrrr;r<rrrrDs r)__dict__zCliDetectionResult.__dict__@sOI $ 5%)%? "1Z - -   r+c0t|jddS)NT) ensure_asciiindent)rrrDs r)to_jsonzCliDetectionResult.to_jsonPsT]a@@@@r+N)rrrr2r r rrr*rrrrrr]r+r)rr%s//3-/s) / $Cy /  /9////sm/////4  $sCx.    X  AAAAAAAr+rN)encodings.aliasesrhashlibrjsonrtypingrrrr r r r constantrutilsrrrrrr2rCoherenceMatchrrr]r+r)rsZ%%%%%%DDDDDDDDDDDDDDDDDD&&&&&&CCCCCCCCCCQ1Q1Q1Q1Q1Q1Q1Q1h@@@@@@@@FsEz"',A,A,A,A,A,A,A,A,A,Ar+