
    <h\                        S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKrS SK	J
r
  S SKJr  SSKJr  SSKJr  \R"                  " \5      rSr " S	 S
\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      rg)    N)Optional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c            
       h    \ rS rSrSr  SS\S\S\S\\   4S jjr	S	 r
S
\R                  4S jrSrg)TextDataset(   @
This will be superseded by a framework-agnostic approach soon.
N	tokenizer	file_path
block_size	cache_dirc           
      (   [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      SL a  [        SU S35      eX1R                  SS9-
  n[
        R                  R                  U5      u  pg[
        R                  R                  Ub  UOUSUR                  R                   SU SU 35      nUS-   n	[        U	5         [
        R                  R                  U5      (       a~  U(       dw  [         R                   " 5       n
[#        US	5       n[$        R&                  " U5      U l        S S S 5        [*        R-                  S
U S3[         R                   " 5       U
-
  5        GO>[*        R-                  SU 35        / U l        [#        USS9 nUR/                  5       nS S S 5        UR1                  UR3                  W5      5      n[5        S[7        U5      U-
  S-   U5       H1  nU R(                  R9                  UR;                  XX-    5      5        M3     [         R                   " 5       n
[#        US5       n[$        R<                  " U R(                  U[$        R>                  S9  S S S 5        [*        R-                  SU S[         R                   " 5       U
-
  S S35        S S S 5        g ! , (       d  f       GN= f! , (       d  f       GN&= f! , (       d  f       Nn= f! , (       d  f       g = f)Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpair
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftexttokenized_textis                   d/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/data/datasets/language_modeling.py__init__TextDataset.__init__-   s}    	&&u 		
 77>>)$-/	{*EFF"E"E5"E"QQ
 ggmmI6	!ww||".II,,556a
|1XJO 
 )72	i ww~~233O		.5$*KK$7DM 689M8Nn]_c_h_h_jmr_r
 Ei[QR ")g6!668D 7 "+!@!@ASASTXAY!Zq#n"5
"BQ"F
SAMM((!BB>VWVdCef T 		.5KKv@W@WX 678L7MWUYU^U^U`chUhilTmmpq; !  65 76 657 ! sW   $AL1KA#L0K BL/K2
;L
K	L 
K/	*L2
L 	<L
Lc                 ,    [        U R                  5      $ NrC   r<   rH   s    rT   __len__TextDataset.__len__j       4==!!    returnc                 b    [         R                  " U R                  U   [         R                  S9$ )Ndtype)torchtensorr<   longrH   rS   s     rT   __getitem__TextDataset.__getitem__m   s     ||DMM!,EJJ??r^   r<   )FN)r6   
__module____qualname____firstlineno____doc__r   strintr   rU   r[   rc   Tensorrg   __static_attributes__ r^   rT   r
   r
   (   sV     #';&; ; 	; C=;z"@ @r^   r
   c                   `    \ rS rSrSrS\S\S\4S jrS r	S\
\\R                  4   4S	 jrS
rg)LineByLineTextDatasetq   r   r   r   r   c           	         [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      SL a  [        SU S35      e[        R                  SU 35        [        USS9 nUR                  5       R                  5        Vs/ sH-  n[        U5      S:  d  M  UR                  5       (       a  M+  UPM/     nnS S S 5        U" WS	S	US
9nUS   U l        U R                    Vs/ sH(  nS["        R$                  " U["        R&                  S90PM*     snU l        g s  snf ! , (       d  f       Nk= fs  snf )Nr   Fr   r   r   r   r   r   Tadd_special_tokens
truncation
max_length	input_idsra   )r)   r*   r+   r,   r-   r.   r/   r0   r1   r=   r>   r9   r?   
splitlinesrC   isspacer<   rc   rd   re   )	rH   r   r   r   rP   linelinesbatch_encodinges	            rT   rU   LineByLineTextDataset.__init__v   s   &&u 		
 77>>)$-/	{*EFF 	=i[IJ)g.!&'ffh&9&9&;f&;dD	ATVZVbVbVdT&;Ef / #5Td_ij&{3SWS`S`aS`a+u||AUZZ'HIS`a	 g /.
 bs0   !D=#D89D8D8D=.E8D==
Ec                 ,    [        U R                  5      $ rX   rY   rZ   s    rT   r[   LineByLineTextDataset.__len__   r]   r^   r_   c                      U R                   U   $ rX   ri   rf   s     rT   rg   !LineByLineTextDataset.__getitem__       }}Qr^   ri   Nr6   rj   rk   rl   rm   r   rn   ro   rU   r[   dictrc   rd   rg   rq   rr   r^   rT   rt   rt   q   sF    b"5 b# bSV b*" S%,,%6 7  r^   rt   c                   d    \ rS rSrSrS\S\S\S\4S jrS r	S	\
\\R                  4   4S
 jrSrg)LineByLineWithRefDataset   r   r   r   r   ref_pathc                    [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      SL a  [        SU S35      e[
        R                  R                  U5      SL a  [        SU S35      e[        R                  SU 35        [        R                  SU 35        [        USS	9 nUR                  5       nS S S 5        W Vs/ sH;  n[        U5      S
:  d  M  UR                  5       (       a  M+  UR                  5       PM=     nn[        USS	9 nUR!                  5       R#                  5        Vs/ sHA  n[        U5      S
:  d  M  UR                  5       (       a  M+  [$        R&                  " U5      PMC     nnS S S 5        [        U5      [        W5      :w  a)  [        SU S[        U5       SU S[        U5       35      eU" USSUS9n	U	S   U l        U R(                   V
s/ sH(  n
S[*        R,                  " U
[*        R.                  S90PM*     sn
U l        [        U R(                  5      n[1        U5       H8  n[*        R,                  " X   [*        R.                  S9U R(                  U   S'   M:     g ! , (       d  f       GN= fs  snf s  snf ! , (       d  f       GN = fs  sn
f )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r   r   r   zDLength of Input file should be equal to Ref file. But the length of z is z while length of Trw   r{   ra   chinese_ref)r)   r*   r+   r,   r-   r.   r/   r0   r1   r=   r>   r9   	readlinesrC   r}   stripr?   r|   jsonloadsr<   rc   rd   re   rB   )rH   r   r   r   r   rP   datar~   refr   r   nrS   s                rT   rU   !LineByLineWithRefDataset.__init__   sJ   &&y 		
 77>>)$-/	{*EFF77>>(#u,~i[
CDD 	=i[IJ1(<=)g.!;;=D /)-VTQt||~

V(W-010C0C0Ep0E#d)VW-#`d`l`l`n#4::d#0ECp .t9C VW`Vaaefijnfoep q##+*DS
< 
 #4DT^hi&{3SWS`S`aS`a+u||AUZZ'HIS`aqA.3ll36.TDMM!]+ # /.V q .- bsN   J(J.>J.J.5!J8J3,J3J3J8	.K

J+3J88
Kc                 ,    [        U R                  5      $ rX   rY   rZ   s    rT   r[    LineByLineWithRefDataset.__len__   r]   r^   r_   c                      U R                   U   $ rX   ri   rf   s     rT   rg   $LineByLineWithRefDataset.__getitem__   r   r^   ri   Nr   rr   r^   rT   r   r      sP    "U"5 "U# "USV "Ube "UH" S%,,%6 7  r^   r   c                   j    \ rS rSrSrS\S\S\4S jrSS jr	S r
S	\\\R                  4   4S
 jrSrg)LineByLineWithSOPTextDataset   zQ
Dataset for sentence order prediction task, prepare sentence pairs for SOP task
r   file_dirr   c                    [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      SL a  [        U S35      e[        R                  SU 35        / U l        [
        R                  " U5       GH:  n[
        R                  R                  X$5      n[
        R                  R                  U5      SL a  [        U S35      eSn[        USS9 nUR!                  5       n/ n	U H  n
SU
;   a  S	nM  S
U
;   a  SnU	SS   V
s/ sHK  n
[#        U
5      S:  d  M  U
R%                  5       (       a  M+  UR'                  UR)                  U
5      5      PMM     nn
U R+                  XU5      nU R                  R-                  U5        / n	M  U(       d  M  U	R/                  U
5        M     S S S 5        GM=     [        R                  S5        g s  sn
f ! , (       d  f       GMj  = f)Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer   r   z<doc id=Tz</doc>r!   r   zDataset parse finished.)r)   r*   r+   r,   r-   r.   r/   isdirr1   r=   r>   r<   listdirr4   r0   r9   r   rC   r}   r@   rA   create_examples_from_documentextendrD   )rH   r   r   r   	file_namer   article_openrP   original_linesarticle_linesr~   documentr<   s                rT   rU   %LineByLineWithSOPTextDataset.__init__   s   &&u 		
 77=="e+z)<=>>DXJOP H-IX9Iww~~i(E1 I;n!=>> Li'2a!" "*D!T)'+!T)', )6ab(9$(9 #D	A V6:lln VI;;I<N<Nt<TU(9 ! $ $(#E#Eh\e#f,,X6(*'<)006! + 32 .4 	-.$ 32s0   31G.$G)
:G)
$G)
58G.1G.)G..
G>	c                 ,   X#R                  SS9-
  nUn[        R                  " 5       U:  a  [        R                  " SU5      n/ n/ nSn	Sn
U
[        U5      :  Ga8  X   nU(       d  U
S-  n
M"  UR	                  U5        U	[        U5      -  n	U
[        U5      S-
  :X  d  X:  Ga  U(       Ga  Sn[        U5      S:  a#  [        R                  " S[        U5      S-
  5      n/ n[        U5       H  nUR                  X   5        M     / n[        U[        U5      5       H  nUR                  X   5        M     [        U5      S:X  d  [        U5      S:X  a  GM  [        R                  " 5       S:  a  SnXpOSnS nU" XU5        [        U5      S:  d  [        S	[        U5       S
35      e[        U5      S:  d  [        S[        U5       S
35      eUR                  X5      nUR                  X5      n[        R                  " U[        R                  S9[        R                  " U[        R                  S9[        R                  " U(       a  SOS[        R                  S9S.nUR	                  U5        / nSn	U
S-  n
U
[        U5      :  a  GM8  U$ )'Creates examples for a single document.Tr      r   r!         ?Fc                     [        U 5      [        U5      -   nX2::  a  g[        U 5      [        U5      :  a  U OUn[        U5      S:  d  [        S5      e[        R                  " 5       S:  a  US	 OUR                  5         M  )z;Truncates a pair of sequences to a maximum sequence length.r!   z8Sequence length to be truncated must be no less than oner   r   N)rC   r1   randompop)tokens_atokens_bmax_num_tokenstotal_lengthtrunc_tokenss        rT   truncate_seq_pairULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pair-  sx    "+.x=3x=+HL+= %7:8}s8}7T8ZbL$'$5$:&01k&l l  &}}4$0O , 0 0 2 #r^   Length of sequence a is  which must be no less than 1Length of sequence b is ra   )r{   token_type_idssentence_order_label)r2   r   randintrC   rD   rB   r   r1   rE   $create_token_type_ids_from_sequencesrc   rd   re   )rH   r   r   r   short_seq_probr   target_seq_lengthr<   current_chunkcurrent_lengthrS   segmenta_endr   jr   is_nextr   r{   r   examples                        rT   r   :LineByLineWithSOPTextDataset.create_examples_from_document   sa    $&I&It&I&TT +==?^+ &q. A #h-kGQ  )c'l*NCMA%%)L E=)Q. &q#m2Dq2H I!H"5\ (89 *  "H"5#m*<= (89 > 8})S]a-?  }},"'-5("&3  &h.IMQ.(+CCM?Ro)pqqMQ.(+CCM?Ro)pqq !* J J8 ^I%.%S%ST\%gN &+\\)5::%N*/,,~UZZ*X05'QqX]XbXb0cG
 OOG, "!"FAM #h-N r^   c                 ,    [        U R                  5      $ rX   rY   rZ   s    rT   r[   $LineByLineWithSOPTextDataset.__len__S  r]   r^   r_   c                      U R                   U   $ rX   ri   rf   s     rT   rg   (LineByLineWithSOPTextDataset.__getitem__V  r   r^   ri   N)皙?)r6   rj   rk   rl   rm   r   rn   ro   rU   r   r[   r   rc   rd   rg   rq   rr   r^   rT   r   r      sJ    '/"5 '/ '/RU '/RaF" S%,,%6 7  r^   r   c                   f    \ rS rSrSr   SS\S\S\4S jjrS\	\	\      S\S\4S	 jr
S
 rS rSrg)$TextDatasetForNextSentencePredictioniZ  r   r   r   r   c           	      N   [         R                  " [        R                  S5      [        5        [
        R                  R                  U5      (       d  [        SU S35      eXPl	        X`l
        [
        R                  R                  U5      u  px[
        R                  R                  USUR                  R                   SU SU 35      n	Xl        U	S-   n
[!        U
5         [
        R                  R#                  U	5      (       a~  U(       dw  [$        R$                  " 5       n['        U	S5       n[(        R*                  " U5      U l        S S S 5        [.        R1                  SU	 S	3[$        R$                  " 5       U-
  5        GO[.        R1                  S
U 35        / /U l        ['        USS9 n UR5                  5       nU(       d  OUR7                  5       nU(       d7  [9        U R2                  S   5      S:w  a  U R2                  R;                  / 5        UR=                  U5      nUR?                  U5      nU(       a  U R2                  S   R;                  U5        M  S S S 5        [.        R1                  S[9        U R2                  5       S35        / U l        [A        U R2                  5       H  u  nnU RC                  UUU5        M     [$        R$                  " 5       n['        U	S5       n[(        RD                  " U R,                  U[(        RF                  S9  S S S 5        [.        R1                  SU	 S[$        R$                  " 5       U-
  S S35        S S S 5        g ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       Nn= f! , (       d  f       g = f)Nr   r   r   cached_nsp_r   r   r   r   r   r   r   r   r   zCreating examples from z documents.r"   r#   r%   r&   r'   r(   )$r)   r*   r+   r,   r-   r.   r/   r0   r1   short_seq_probabilitynsp_probabilityr3   r4   r5   r6   r   r   r7   r8   r9   r:   r;   r<   r=   r>   	documentsreadliner   rC   rD   rA   r@   	enumerater   rF   rG   )rH   r   r   r   rI   r   r   rJ   rK   rL   rM   rN   rO   rP   r~   tokens	doc_indexr   s                     rT   rU   -TextDatasetForNextSentencePrediction.__init___  s    	&&u 		
 ww~~i((/	{*EFF%:". ggmmI6	!ww||)--667qAhZP 

 # )72	 i ww~~233O		.5$*KK$7DM 689M8Nn]_c_h_h_jmr_r Ei[QR"$)g6! zz|#!#zz|  $DNN2,>(?1(D NN11"5!*!3!3D!9!*!@!@!H! NN2.55f=  7 5c$..6I5J+VW "+4T^^+D'Ix66xJW ,E 		.5KKv@W@WX 678L7MWUYU^U^U`chUhilTmmpqG !  65 76* 65C ! sX   "AN/M!A$N/B1M3 BN./N;N!
M0	+N3
N	=N
N	N
N$r   r   c                    X0R                   R                  SS9-
  nUn[        R                  " 5       U R                  :  a  [        R                  " SU5      n/ nSnSnU[        U5      :  Ga  X   n	UR                  U	5        U[        U	5      -  nU[        U5      S-
  :X  d  Xu:  Ga  U(       Ga  Sn
[        U5      S:  a#  [        R                  " S[        U5      S-
  5      n
/ n[        U
5       H  nUR                  Xl   5        M     / n[        U5      S:X  d#  [        R                  " 5       U R                  :  a  SnU[        U5      -
  n[        S5       H8  n[        R                  " S[        U R                  5      S-
  5      nUU:w  d  M8    O   U R                  W   n[        R                  " S[        U5      S-
  5      n[        U[        U5      5       H(  nUR                  UU   5        [        U5      U:  d  M(    O   [        U5      U
-
  nUU-  nO1Sn[        U
[        U5      5       H  nUR                  Xl   5        M     [        U5      S:  d  [        S[        U5       S	35      e[        U5      S:  d  [        S
[        U5       S	35      eU R                   R                  X5      nU R                   R                  X5      n[        R                  " U[        R                   S9[        R                  " U[        R                   S9[        R                  " U(       a  SOS[        R                   S9S.nU R"                  R                  U5        / nSnUS-  nU[        U5      :  a  GM  gg)r   Tr   r   r   r!   
   Fr   r   r   ra   )r{   r   next_sentence_labelN)r   r2   r   r   r   rC   rD   rB   r   r   r   r1   rE   r   rc   rd   re   r<   )rH   r   r   r   r   r   r   r   rS   r   r   r   r   r   is_random_nexttarget_b_lengthr   random_document_indexrandom_documentrandom_startnum_unused_segmentsr{   r   r   s                           rT   r   BTextDatasetForNextSentencePrediction.create_examples_from_document  s    $nn&N&NTX&N&YY +==?T777 &q. A#h-kG  )c'l*NCMA%%)L  E=)Q. &q#m2Dq2H I!H"5\ (89 *  "H=)Q.&--/DDXDX2X)-*;c(m*K "'rA4:NN1c$..FY\]F]4^14	A % "+
 +/..9N*O'-~~a_9MPQ9Q'R!&|S5I!JA$OOOA,>?"8}? % "K /2-.@5.H+00 */!&uc-.@!AA$OOM,<= "B  MQ.(+CCM?Ro)pqqMQ.(+CCM?Ro)pqq !% O OPX cI%)^^%X%XYa%lN &+\\)5::%N*/,,~UZZ*X/4||AUV^c^h^h/iG MM((1 "!"FAI #h-r^   c                 ,    [        U R                  5      $ rX   rY   rZ   s    rT   r[   ,TextDatasetForNextSentencePrediction.__len__  r]   r^   c                      U R                   U   $ rX   ri   rf   s     rT   rg   0TextDatasetForNextSentencePrediction.__getitem__  r   r^   )r   r<   r   r   r   N)Fr   r   )r6   rj   rk   rl   rm   r   rn   ro   rU   listr   r[   rg   rq   rr   r^   rT   r   r   Z  sk     !S&S S 	SjXd49o XRU Xcf Xt" r^   r   )r   r.   r:   r   r8   r)   typingr   rc   filelockr   torch.utils.datar   tokenization_utilsr   utilsr   
get_loggerr6   r=   r+   r
   rt   r   r   r   rr   r^   rT   <module>r      s     	        $ 5  
		H	%L F@' F@R G  B- w - `U 7 U px 7 x r^   