
"""PyTorch OpenAI ImageGPT model."""

import math
import os
from typing import Any, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
from ...utils import auto_docstring, logging, torch_float
from .configuration_imagegpt import ImageGPTConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path):
    """
    Load tf checkpoints in a pytorch model
    """
    try:
        import re

        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(imagegpt_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []

    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array.squeeze())

    for name, array in zip(names, arrays):
        name = name[6:]  # skip "model/"
        name = name.split("/")

        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ) or name[-1] in ["_step"]:
            logger.info("Skipping {}".format("/".join(name)))
            continue

        pointer = model
        if name[-1] not in ["wtet"]:
            pointer = getattr(pointer, "transformer")

        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "w" or scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            elif scope_names[0] in ["q_proj", "k_proj", "v_proj"]:
                pointer = getattr(pointer, "c_attn")
                pointer = getattr(pointer, "weight")
            elif len(name) == 3 and name[1] == "attn" and scope_names[0] == "c_proj":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "wtet":
                pointer = getattr(pointer, "lm_head")
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "sos":
                pointer = getattr(pointer, "wte")
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if len(name) > 1 and name[1] == "attn" or name[-1] == "wtet" or name[-1] == "sos" or name[-1] == "wte":
            pass  # array is used to initialize only part of the pointer, so sizes won't match
        else:
            try:
                assert pointer.shape == array.shape
            except AssertionError as e:
                e.args += (pointer.shape, array.shape)
                raise

        logger.info(f"Initialize PyTorch weight {name}")

        if name[-1] == "q_proj":
            pointer.data[:, : config.n_embd] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
        elif name[-1] == "k_proj":
            pointer.data[:, config.n_embd : 2 * config.n_embd] = torch.from_numpy(
                array.reshape(config.n_embd, config.n_embd)
            ).T
        elif name[-1] == "v_proj":
            pointer.data[:, 2 * config.n_embd :] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
        elif len(name) == 3 and name[1] == "attn" and name[2] == "c_proj":
            pointer.data = torch.from_numpy(array.reshape(config.n_embd, config.n_embd))
        elif name[-1] == "wtet":
            pointer.data = torch.from_numpy(array)
        elif name[-1] == "wte":
            pointer.data[: config.vocab_size - 1, :] = torch.from_numpy(array)
        elif name[-1] == "sos":
            pointer.data[-1] = torch.from_numpy(array)
        else:
            pointer.data = torch.from_numpy(array)

    return model

Xc                   x   ^  \ rS rSrS	S\\   S\4U 4S jjjrS\R                  S\R                  4S jr
SrU =r$ )
ImageGPTLayerNorm   hidden_sizeepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g N)super__init__rt   r   	ParameterrY   Tensorr1   )selfrs   rt   	__class__s      r(   rx   ImageGPTLayerNorm.__init__   s,    ll5<<#<=    tensorreturnc           	          U[         R                  " [         R                  " [         R                  " U5      SSS9U R                  -   5      -  nXR
                  -  nU$ )Nr+   T)axiskeepdim)rY   sqrtmeansquarert   r1   )r{   r   s     r(   forwardImageGPTLayerNorm.forward   sI    %**UZZV0D2W[%\_c_g_g%ghh++%r~   )rt   r1   )gh㈵>)__name__
__module____qualname____firstlineno__tuplerU   floatrx   rY   rz   r   __static_attributes____classcell__r|   s   @r(   rq   rq      s?    >E#J >U > >
ell u||  r~   rq   c                   X  ^  \ rS rSrSS\\   S\\   4U 4S jjjrS rSS jr	SS jr
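# Note: unlike `nn.LayerNorm`, the normalization above does not subtract the mean and has no bias
# term, i.e. it is an RMSNorm-style rescaling. A minimal sanity check (illustrative only, not part
# of the modeling code; in practice the weight is filled by `_init_weights` below):
#
#     norm = ImageGPTLayerNorm(8)
#     norm.weight.data.fill_(1.0)
#     x = torch.randn(2, 4, 8)
#     rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + norm.eps)
#     torch.testing.assert_close(norm(x), x / rms)
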
S rS	 r        SS
\R                  S\\   S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\R                     S\4S jjrSrU =r$ )ImageGPTAttention   is_cross_attention	layer_idxc           
        > [         TU ]  5         UR                  nU R                  S[        R
                  " [        R                  " XD4[        R                  S95      R                  SSXD5      SS9  U R                  S[        R                  " S5      SS9  UR                  U l        UR                  U l        U R                  U R                  -  U l        U R                  U l        U R                  U R                  -  U R                  :w  a&  [!        SU R                   S	U R                   S
35      eUR"                  U l        X l        UR&                  U l        X0l        UR*                  U l        U R$                  (       aN  [-        SU R                  -  U R                  5      U l        [-        U R                  U R                  5      U l        O([-        SU R                  -  U R                  5      U l        [-        U R                  U R                  5      U l        [4        R6                  " UR8                  5      U l        [4        R6                  " UR<                  5      U l        [A        5       U l!        g )Nr3   dtyper   F)
persistentmasked_biasg     z=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r>   r
   )"rw   rx   max_position_embeddingsregister_bufferrY   trilonesboolviewr   rs   	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorscale_attn_weightsr   scale_attn_by_inverse_layer_idxr   reorder_and_upcast_attnr   r9   q_attnr;   r   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropoutsetpruned_heads)r{   ra   r   r   max_positionsr|   s        r(   rx   ImageGPTAttention.__init__   s   66JJuzz="@

STYY1m  	 	
 	]ELL,>5Q++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;Er~   c                 8   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[
        R                  " X"U R                  -   USU R                  -  -   /5      n[        U R                  USS9U l	        [        U R                  USS9U l
        U R                  U R                  -  U R                  [        U5      -
  -  U l        U R                  [        U5      -
  U l        U R                  R                  U5      U l        g )Nr   r>   r   dim)rT   r   r   r   r   rY   catr   r   r9   r;   union)r{   headsindex
index_attns       r(   prune_headsImageGPTAttention.prune_heads   s    u:?7~~t}}^b^o^opYYt'>T__I\@]^_
 )jaH(eC  ??dnn<RUV[R\A\]#e*4 --33E:r~   c                 T   [         R                  " XR                  SS5      5      nU R                  (       a   U[	        UR                  S5      S-  5      -  nU R                  (       a  U[        U R                  S-   5      -  nU R                  (       d  UR                  S5      UR                  S5      pU R                  S S 2S S 2X-
  U2S U24   n	[         R                  " UR                  5      R                  n
[         R                  " XR                  UR                  S9n
[         R                   " XU
5      nUb  Xd-   n["        R$                  " SS9" U5      nUR'                  UR                  5      nU R)                  U5      nUb  Xe-  n[         R                  " Xc5      nX4$ )Nr+         ?r   r   devicer   )rY   matmul	transposer   r   sizer   r   r   r   r3   finfor   minr   r   wherer   Softmaxtyper   )r{   querykeyvalueattention_mask	head_maskattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs               r(   _attnImageGPTAttention._attn   s`   ||E==R+@A""'+ejjn6K*LLL //'%0B*CCL&&',zz"~sxx|*))Aq**Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{*ML%'8Lzzb),7 $((5((6  '3Lll<7((r~   c           	         UR                  5       u  pgpUR                  5       u    pn
[        R                  " Xg-  X[        R                  UR                  S9nSnU R
                  (       a   U[        UR                  S5      5      S-  -  nU R                  (       a  U[        U R                  S-   5      -  n[        R                  " UR                  R                  SS9   UR                  SX5      UR                  SS5      R                  SX5      p[        R                  " XR                  5       UR                  5       S	US
9nUR                  XgX5      nS S S 5        U R                  (       d  UR                  S5      UR                  S5      nnU R                  S S 2S S 2UU-
  U2S U24   n[        R                   " UR"                  5      R$                  n[        R&                  " UUR"                  UR                  S9n[        R(                  " UUU5      nUb  X-   n[*        R,                  " SS9" U5      nUR"                  [        R                  :w  a  [/        S5      eUR                  UR"                  5      nU R1                  U5      nUb  X-  n[        R2                  " X5      nUU4$ ! , (       d  f       GNc= f)Nr         ?r+   r   r   F)enabledr   r   )betaalphar   zDError with upcasting, attn_weights does not have dtype torch.float32)r   rY   emptyfloat32r   r   r   r   r   autocastr   r[   r   baddbmmr   r3   r   r   r   r   r   r   r   RuntimeErrorr   r   )r{   r   r   r   r   r   bszr   	q_seq_lendk_	k_seq_lenr   scale_factorqkr   r   r   r   r   s                        r(   _upcast_and_reordered_attn,ImageGPTAttention._upcast_and_reordered_attn
  sG   (-

%	 XXZ1 {{3?IPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L ^^ELL--u===Y3S]]2r5J5R5RSUWY5eq ==wwy!'')RS[ghL'//	UL >
 &&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'8Lzzb),7 .eff#((5((6  '3Lll<7L((C >=s   !A9J66
Kc                 v    UR                  5       SS X#4-   nUR                  " U6 nUR                  SSSS5      $ )z:
Splits hidden_size dim into attn_head_size and num_heads
Nr+   r   r>   r   r
   )r   r   permuter{   r   r   attn_head_size	new_shapes        r(   _split_headsImageGPTAttention._split_heads>  sA     KKM#2&))DD	i(~~aAq))r~   c                     UR                  SSSS5      R                  5       nUR                  5       SS X#-  4-   nUR                  U5      $ )zC
Merges attn_head_size dim and num_attn_heads dim into hidden_size
r   r>   r   r
   Nr   )r   
contiguousr   r   r   s        r(   _merge_headsImageGPTAttention._merge_headsF  sM     1a+668KKM#2&)*D)FF	{{9%%r~   hidden_states
layer_pastr   r   encoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentionscache_positionr   c
                    US Ln
UR                   u  pnUb]  [        U[        5      (       aF  UR                  R	                  U R
                  5      nU
(       a  UR                  nOUR                  nOUnU
(       a  UOUnU
(       Ga-  [        U S5      (       d  [        S5      eUb`  W(       aY  U R                  U5      nWR                  U R
                     R                  nUR                  U R
                     R                  nGOKU R                  U5      nU R                  U5      R                  U R                   SS9u  nnUR#                  USU R$                  U R&                  5      R)                  SS5      nUR#                  USU R$                  U R&                  5      R)                  SS5      nOU R                  U5      R                  U R                   SS9u  nnnUR#                  USU R$                  U R&                  5      R)                  SS5      nUR#                  USU R$                  U R&                  5      R)                  SS5      nUbN  U
(       d  U	OS n	WR+                  UUU R
                  SU	05      u  nnU
(       a  SUR                  U R
                  '   UR#                  XU R$                  U R&                  5      R)                  SS5      nU R,                  (       a  U R/                  UUUX45      u  nnOU R1                  UUUX45      u  nnU R3                  UU R$                  U R&                  5      nU R5                  U5      nU R7                  U5      nUU4$ )	Nr   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`.r>   r   r+   r   r   T)rV   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachehasattrr   r   layerskeysvaluesr9   rN   r   r   r   r   r   updater   r   r   r   r;   r   )r{   r   r   r   r   r   r   r   r   r   r   r   seq_lenr   r  curr_past_key_valuecurrent_statesr   r   r   r   r   s                         r(   r   ImageGPTAttention.forwardN  s    3$>'--a!*&9::'2266t~~F
%*4*J*J'*4*I*I'&0#2D.-4** t 
 %*M2)00@EE+224>>BIIM2![[8>>tTU>V
UhhsBFPPQRTUV

3DNNDMMJTTUVXYZ $N ; A A$//WX A YE3((3DNNDMMBLLQPQRCJJsBFPPQRTUVE!3E^4N,33CQacqPrsJC!8<
%%dnn5

3GQQRSUVW''(,(G(GsTY[i(u%K(,

5#un(`%K''T^^T]]Skk+.((5L((r~   )r   r9   r;   r   r   r   r   r   r   r   r   r   r   r   r   )FN)NNNNNNNFFN)r   r   r   r   r   r   rU   rx   r   r   r   r   r   rY   rz   r   r   r   r   r   r   s   @r(   r   r      s   )"8D> )"V^_bVc )" )"V;$)L2)h*& '+15,08<9=$),115D)||D) UOD) !.	D)
 ELL)D)  (5D) !) 6D) D>D) $D>D) !.D) 
D) D)r~   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ImageGPTMLPi  c                    > [         TU ]  5         UR                  n[        X5      U l        [        X15      U l        [        UR                     U l        [        R                  " UR                  5      U l        g rv   )rw   rx   rs   r   c_fcr;   r   activation_functionactr   r   r   dropout)r{   intermediate_sizera   r   r|   s       r(   rx   ImageGPTMLP.__init__  sZ    &&	,8	Y:&445zz&"4"45r~   r   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ rv   )r  r  r;   r  )r{   r   s     r(   r   ImageGPTMLP.forward  s@    		-0/M2]3r~   )r  r  r;   r  )
r   r   r   r   rx   rY   rz   r   r   r   r   s   @r(   r  r    s(    6U\\ ell  r~   r  c                     ^  \ rS rSrSU 4S jjr        SS\R                  S\\   S\\R                     S\\R                     S\\R                     S\\R                     S	\\	   S
\\	   S\\R                     S\
4S jjrSrU =r$ )ImageGPTBlocki  c                   > [         TU ]  5         UR                  nUR                  b  UR                  OSU-  n[	        X1R
                  S9U l        [        XS9U l        [	        X1R
                  S9U l	        UR                  (       a(  [        USUS9U l        [	        X1R
                  S9U l        [        XA5      U l        g )N   rt   r   T)r   r   )rw   rx   rs   n_innerrq   layer_norm_epsilonln_1r   r:   ln_2add_cross_attentioncrossattentionln_cross_attnr  mlp)r{   ra   r   rs   	inner_dimr|   s        r(   rx   ImageGPTBlock.__init__  s    ((&,nn&@FNNa+o	%k7P7PQ	%fB	%k7P7PQ	%%"3Ft_h"iD!2;D]D]!^Dy1r~   r   r   r   r   r   r   r   r   r   r   c
                    Un
U R                  U5      nU R                  UUUUUUU	S9nUS   nUSS  nX-   nUbY  [        U S5      (       d  [        SU  S35      eUn
U R	                  U5      nU R                  UUUUUUUU	S9nUS   nX-   nXSS  -   nUn
U R                  U5      nU R                  U5      nX-   nU4U-   $ )N)r   r   r   r   r   r   r   r   r%  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   r   r   r   r   r   r   )r"  r:   r  r   r&  r%  r#  r'  )r{   r   r   r   r   r   r   r   r   r   residualattn_outputsr   outputscross_attn_outputsfeed_forward_hidden_statess                   r(   r   ImageGPTBlock.forward  s3    !		-0yy!)/) ! 
 #1oqr"#. ,4!122 =dV DZ Z  %H ..}=M!%!4!4%-#&;'="3- "5 	" -Q/K$2M12 66G 		-0%)XXm%<" ='))r~   )r:   r%  r"  r#  r&  r'  rv   r  )r   r   r   r   rx   rY   rz   r   r   r   r   r   r   r   r   s   @r(   r  r    s    2$ '+15,08<9=$),115:*||:* UO:* !.	:*
 ELL):*  (5:* !) 6:* D>:* $D>:* !.:* 
:* :*r~   r  c                   P   ^  \ rS rSr% \\S'   \rSrSr	Sr
S/rU 4S jrS rS	rU =r$ )
ImageGPTPreTrainedModeli  ra   r.   	input_idsTr  c                 &   > [         TU ]  " U0 UD6  g rv   )rw   rx   )r{   inputskwargsr|   s      r(   rx    ImageGPTPreTrainedModel.__init__  s    &+F+r~   c           	         [        U[        R                  [        45      (       aj  UR                  R
                  R                  SU R                  R                  S9  UR                  b$  UR                  R
                  R                  5         O[        U[        R                  5      (       aw  UR                  R
                  R                  SU R                  R                  S9  UR                  b1  UR                  R
                  UR                     R                  5         O:[        U[        5      (       a%  UR                  R
                  R                  S5        UR                  5        Hq  u  p#SU;   d  M  SU;   d  M  UR
                  R                  SU R                  R                  [         R"                  " SU R                  R$                  -  5      -  S9  Ms     g)zInitialize the weights.g        )r   stdNr   r;   r1   r>   )r   r   Linearr   r1   r^   normal_ra   initializer_ranger3   zero_	Embeddingpadding_idxrq   fill_named_parametersmathr   n_layer)r{   modulerh   ps       r(   _init_weights%ImageGPTPreTrainedModel._init_weights  sS   fryy&122 MM&&CT[[5R5R&S{{&  &&(--MM&&CT[[5R5R&S!!-""6#5#56<<> 122MM$$S) ..0GD4H$4Cdkk.K.KdiiXY\`\g\g\o\oXoNp.pr 1r~   r%   )r   r   r   r   r   __annotations__ro   load_tf_weightsbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesrx   rF  r   r   r   s   @r(   r2  r2    s9    1O%!O&*#(),s sr~   r2  c            $         ^  \ rS rSrS\4U 4S jjrS rS rS r\	              SS\
\R                     S\
\\\R                           S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\   S\
\R                     S\S\\\4   4 S jj5       rSrU =r$ )ImageGPTModeli  ra   c           
      t  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  U R                  5      U l        [        R
                  " UR                  U R                  5      U l	        [        R                  " UR                  5      U l        [        R                  " [        UR                  5       Vs/ sH  n[!        XS9PM     sn5      U l        [%        U R                  UR&                  S9U l        SU l        S U l        SU l        U R1                  5         g s  snf )Nr  r  F)rw   rx   rs   r   r   r>  r_   r5   r   r4   r   
embd_pdropdrop
ModuleListrangenum_hidden_layersr  hrq   r!  ln_fmodel_parallel
device_mapgradient_checkpointing	post_init)r{   ra   ir|   s      r(   rx   ImageGPTModel.__init__  s     ++<< 1 14>>B<< > >OJJv001	ERXRjRjLklLkqf BLklm%dnn&:S:ST	 $&+#  ms   D5c                     U R                   $ rv   r5   )r{   s    r(   get_input_embeddings"ImageGPTModel.get_input_embeddings/  s    xxr~   c                     Xl         g rv   r_  )r{   new_embeddingss     r(   set_input_embeddings"ImageGPTModel.set_input_embeddings2  s    !r~   c                     UR                  5        H-  u  p#U R                  U   R                  R                  U5        M/     g)zf
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
N)itemsrV  r:   r   )r{   heads_to_prunelayerr   s       r(   _prune_headsImageGPTModel._prune_heads5  s5     +002LEFF5M**51 3r~   r3  past_key_valuesr   token_type_idsposition_idsr   inputs_embedsr   r   r   r   output_hidden_statesreturn_dictr   r6  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  Ub  [        S5      eUbF  U R                  X5        UR                  5       nUR                  SUS   5      nUR                  S   nO1Ub#  UR                  5       SS nUR                  S   nO[        S5      eUb  UR                  OUR                  nU R                  (       a/  U R                  (       a  U
(       a  [        R                  S5        Sn
SnU
(       aB  [        U[         5      (       d-  [        R                  S5        S	n["        R$                  " U5      nUb  UR'                  5       OUnUb  UR                  SUS   5      nUc<  [(        R*                  " UUS   U-   [(        R,                  US
9nUR/                  S5      nUby  US::  a  [        S5      eUR                  US5      nUSS2SSSS24   nUR1                  U R2                  S9nSU-
  [(        R4                  " U R2                  5      R6                  -  nU R                   R8                  (       aE  UbB  UR                  5       u  nnnUU4nU	c  [(        R:                  " UUS9n	U R=                  U	5      n	OSn	U R?                  X`R                   R@                  5      nUc  U RC                  U5      nU RE                  U5      nUUR1                  UR                  5      -   nUb  U RC                  U5      nUU-   nU RG                  U5      nUUR                  S5      4-   nU(       a  SOSnU(       a  U R                   R8                  (       a  SOSnU(       a  SOSn[I        U RJ                  5       GHq  u  n n!U RL                  (       a  [(        RN                  RQ                  UR                  5        Ub  UR1                  UR                  5      n[        U[(        RR                  5      (       a  UR1                  UR                  5      nU(       a  UU4-   nU!" UUUUU    UU	U
UUS9	n"U"S   nU(       a-  UU"S   4-   nU R                   R8                  (       a	  UU"S   4-   nU RL                  (       d  GM  U RT                  RW                  5        HO  u  n#n$U U$S   :X  d  M  S[Y        U#5      -   U RZ                  :w  d  M/  UR1                  S[Y        U#S-   5      -   5      nMQ     GMt     U R]                  U5      nUR                  " U6 nU(       a  UU4-   nU(       a  UR_                  5       nU(       d  [a        S UUUUU4 5       5      $ [c        UUUUUS9$ )aJ  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTModel
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            return_legacy_cache = True
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)

        # ImageGPTAttention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 4D attention mask from a 2D tensor mask with sizes
            # [batch_size, 1, 1, to_seq_length] so it broadcasts to
            # [batch_size, num_heads, from_seq_length, to_seq_length].
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for masked
            # positions, this creates a tensor which is 0.0 for positions we want to attend and
            # the dtype's smallest value for masked positions. Since it is added to the raw
            # scores before the softmax, this is effectively the same as removing them entirely.
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        # head_mask has shape n_layer x batch x n_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)

        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                # Ensure that attention_mask and head_mask are always on the same device as hidden_states
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if isinstance(head_mask, torch.Tensor):
                    head_mask = head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=past_key_values,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[2],)

            # Model parallel: if it's the last layer for that device, put things on the next device
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )

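# A minimal sketch of how the cache plumbing above is typically exercised (illustrative only; the
# small config values here are arbitrary). Each `ImageGPTBlock` indexes the shared cache by its
# `layer_idx`, so only the new token needs to be fed on subsequent steps:
#
#     model = ImageGPTModel(ImageGPTConfig(n_layer=2, n_embd=64, n_head=4))
#     ids = torch.randint(0, model.config.vocab_size - 1, (1, 8))
#     out = model(ids, use_cache=True)
#     # cached keys/values cover the first 8 positions; pass the cache back for step 9
#     step = model(ids[:, -1:], past_key_values=out.past_key_values, use_cache=True)
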
@auto_docstring(
    custom_intro="""
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        self.transformer = ImageGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Any,
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
        >>> import torch
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> # unconditional generation of 4 images
        >>> batch_size = 4
        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
        >>> context = context.to(device)
        >>> output = model.generate(
        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
        ... )

        >>> clusters = image_processor.clusters
        >>> height = image_processor.size["height"]
        >>> width = image_processor.size["width"]

        >>> samples = output[:, 1:].detach().cpu().numpy()
        >>> samples_img = [
        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
        ... ]  # convert color cluster tokens back to pixels
        >>> f, axes = plt.subplots(1, batch_size, dpi=300)

        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
        ...     ax.axis("off")
        ...     ax.imshow(img)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
            cross_attentions=transformer_outputs.cross_attentions,
        )

@auto_docstring(
    custom_intro="""
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    """
)
class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = ImageGPTModel(config)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Any,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        # average-pool the hidden states along the sequence dimension
        pooled_hidden_states = hidden_states.mean(dim=1)
        # project from (batch_size, hidden_size) to (batch_size, num_labels)
        logits = self.score(pooled_hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = [
    "ImageGPTForCausalImageModeling",
    "ImageGPTForImageClassification",
    "ImageGPTModel",
    "ImageGPTPreTrainedModel",
    "load_tf_weights_in_imagegpt",
]