
    h$                     <   d Z ddlZddlZddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$  e!jJ                  e&      Z'd Z( G d dejR                        Z* G d dejR                        Z+ G d dejR                        Z, G d de      Z-e  G d de             Z.e  G d de.             Z/ e d       G d  d!e.e             Z0 e d"       G d# d$e.             Z1g d%Z2y)&zPyTorch OpenAI ImageGPT model.    N)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPast)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_conv1d_layer)auto_docstringloggingtorch_float   )ImageGPTConfigc                    	 ddl }ddl}t
        j                  j                  |      }t        j                  d|        |j                  j                  |      }g }g }|D ]l  \  }	}
t        j                  d|	 d|
        |j                  j                  ||	      }|j                  |	       |j                  |j                                n t        ||      D ]  \  }	}|	dd }	|	j                  d      }	t!        d	 |	D              s|	d
   dv r4t        j                  dj#                  dj%                  |	                   j| }|	d
   dvrt'        |d      }|	D ]W  }|j)                  d|      r|j                  d|      }n|g}|d   dk(  s|d   dk(  rt'        |d      }n|d   dk(  rt'        |d      }n|d   dk(  s|d   dk(  rt'        ||d         }t'        |d      }n|d   dv rt'        |d      }t'        |d      }nt+        |	      dk(  r,|	d   dk(  r$|d   dk(  rt'        ||d         }t'        |d      }nQ|d   dk(  rt'        |d      }t'        |d      }n0|d   d k(  rt'        |d      }t'        |d      }nt'        ||d         }t+        |      d!k\  sEt-        |d         }||   }Z t+        |	      dkD  r|	d   dk(  s|	d
   dk(  s|	d
   d k(  s|	d
   dk(  rn	 |j.                  |j.                  k(  sJ 	 t        j                  d"|	        |	d
   d#k(  rbt5        j6                  |j9                  |j:                  |j:                              j<                  |j>                  ddd|j:                  f<   |	d
   d$k(  rot5        j6                  |j9                  |j:                  |j:                              j<                  |j>                  dd|j:                  d!|j:                  z  f<   !|	d
   d%k(  ret5        j6                  |j9                  |j:                  |j:                              j<                  |j>                  ddd!|j:                  z  df<   t+        |	      dk(  rP|	d   dk(  rH|	d!   dk(  r@t5        j6                  |j9                  |j:                  |j:                              |_        |	d
   dk(  rt5        j6                  |      |_        |	d
   dk(  r7t5        j6                  |      |j>                  d|j@                  dz
  ddf<   O|	d
   d k(  r$t5        j6                  |      |j>                  d
<   {t5        j6                  |      |_         | S # t        $ r t        j	                  d        w xY w# t0        $ r1}|xj2                  |j.                  |j.                  fz  c_         d}~ww xY w)&z0
    Load tf checkpoints in a pytorch model
    r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape    /c              3   $   K   | ]  }|d v  
 yw))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0ns     m/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py	<genexpr>z.load_tf_weights_in_imagegpt.<locals>.<genexpr>P   s      
 nn   )_stepzSkipping {})wtettransformerz[A-Za-z]+\d+z(\d+)wgweightbbiaswpewte)q_projk_projv_projc_attnr   r   attnc_projr,   lm_headsos   zInitialize PyTorch weight r5   r6   r7   )!re
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendsqueezezipsplitanyformatjoingetattr	fullmatchlenintshapeAssertionErrorargstorch
from_numpyreshapen_embdTdata
vocab_size)modelconfigimagegpt_checkpoint_pathr>   tftf_path	init_varsnamesarraysnamerU   arraypointerm_namescope_namesnumes                    r'   load_tf_weights_in_imagegptrn   /   sh   	 ggoo67G
KK8	BC''0IEF e(l5'BC&&w5Temmo&	 ! 5&)eABxzz#  

 
 "X"KK,,SXXd^<=88#g}5GF||OV4 hhx8%h1~$A#(=!'84Q3&!'62Q5(KNe,C!';q>:!'84Q#AA!'84!'84TaDGv$5+a.H:T!';q>:!'84Q6)!'95!'84Q5(!'51!'84!';q>:;1$+a.)!#,; > t9q=T!W.$r(f2DRTYHY]abd]ein]n}}333
 	0788x/4/?/?fmm]c]j]j@k/l/n/nGLLOfmmO+,"X!AFAQAQfmmV]];Ba LLFMMA,===> "X!383C3CEMMRXR_R_agananDo3p3r3rGLLA-//0Y!^Q6 1d1g6I ++EMM&--,WXGL"X ++E2GL"X7<7G7G7NGLL06,,q00!34"X$//6GLL ++E2GLY *\ LC  Q	
 	P " 7==%++66s#   U7 )V7 V	W#,WWc                   h     e Zd Zddee   def fdZdej                  dej                  fdZ	 xZ
S )ImageGPTLayerNormhidden_sizeepsc                     t         |           || _        t        j                  t        j                  |            | _        y N)super__init__rr   r   	ParameterrX   Tensorr0   )selfrq   rr   	__class__s      r'   rv   zImageGPTLayerNorm.__init__   s.    ll5<<#<=    tensorreturnc                     |t        j                  t        j                  t        j                  |      dd      | j                  z         z  }|| j
                  z  }|S )Nr*   T)axiskeepdim)rX   sqrtmeansquarerr   r0   )ry   r|   s     r'   forwardzImageGPTLayerNorm.forward   sK    %**UZZV0D2W[%\_c_g_g%ghh$++%r{   )gh㈵>)__name__
__module____qualname__tuplerT   floatrv   rX   rx   r   __classcell__rz   s   @r'   rp   rp      s5    >E#J >U >
ell u|| r{   rp   c                   B    e Zd Zddee   dee   f fdZd ZddZddZ	d Z
d Z	 	 	 	 	 	 	 	 dd	ej                  d
ee   deej                     deej                     deej                     deej                     dee   dee   deej                     defdZ xZS )ImageGPTAttentionis_cross_attention	layer_idxc           	         t         |           |j                  }| j                  dt	        j
                  t	        j                  ||ft        j                              j                  dd||      d       | j                  dt	        j                  d      d       |j                  | _        |j                  | _        | j                  | j                  z  | _        | j                  | _        | j                  | j                  z  | j                  k7  r&t!        d| j                   d	| j                   d
      |j"                  | _        || _        |j&                  | _        || _        |j*                  | _        | j$                  rNt-        d| j                  z  | j                        | _        t-        | j                  | j                        | _        n(t-        d| j                  z  | j                        | _        t-        | j                  | j                        | _        t5        j6                  |j8                        | _        t5        j6                  |j<                        | _        tA               | _!        y )Nr2   dtyper   F)
persistentmasked_biasg     z=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r=   r   )"ru   rv   max_position_embeddingsregister_bufferrX   trilonesboolviewr|   rq   	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorscale_attn_weightsr   scale_attn_by_inverse_layer_idxr   reorder_and_upcast_attnr   r8   q_attnr:   r   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropoutsetpruned_heads)ry   r`   r   r   max_positionsrz   s        r'   rv   zImageGPTAttention.__init__   s   66JJuzz=-"@

STYY1m]  	 	
 	]ELL,>5Q++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;Er{   c                 F   t        |      dk(  ry t        || j                  | j                  | j                        \  }}t        j                  ||| j                  z   |d| j                  z  z   g      }t        | j                  |d      | _	        t        | j                  |d      | _
        | j                  | j                  z  | j                  t        |      z
  z  | _        | j                  t        |      z
  | _        | j                  j                  |      | _        y )Nr   r=   r   dim)rS   r   r   r   r   rX   catr   r   r8   r:   union)ry   headsindex
index_attns       r'   prune_headszImageGPTAttention.prune_heads   s    u:?7t~~t}}^b^o^opuYYut'>T__I\@]^_
 )jaH(eC  ??dnn<RUV[R\A\]#e*4 --33E:r{   c                 D   t        j                  ||j                  dd            }| j                  r |t	        |j                  d      dz        z  }| j                  r|t        | j                  dz         z  }| j                  s|j                  d      |j                  d      }}| j                  d d d d ||z
  |d |f   }	t        j                  |j                        j                  }
t        j                  |
|j                  |j                        }
t        j                   |	||
      }|||z   } t#        j$                  d      |      }|j'                  |j                        }| j)                  |      }|||z  }t        j                  ||      }||fS )Nr*         ?r   r   devicer   )rX   matmul	transposer   r   sizer   r   r   r   r2   finfor   minr|   r   wherer   Softmaxtyper   )ry   querykeyvalueattention_mask	head_maskattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs               r'   _attnzImageGPTAttention._attn   st   ||E3==R+@A""'+ejjn6K*LLL //'%0B*CCL&&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'.8L)rzzb),7 $((5((6  ')3Lll<7L((r{   c                    |j                         \  }}}}	|j                         \  }
}
}}
t        j                  ||z  ||t        j                  |j                        }d}| j
                  r |t        |j                  d            dz  z  }| j                  r|t        | j                  dz         z  }t        j                  |j                  j                  d      5  |j                  d||	      |j                  dd      j                  d|	|      }}t        j                  ||j                         |j                         d	|
      }|j                  ||||      }d d d        | j                  s|j                  d      |j                  d      }}| j                  d d d d ||z
  |d |f   }t        j                   |j"                        j$                  }t        j&                  ||j"                  |j                        }t        j(                  |||      }|||z   } t+        j,                  d      |      }|j"                  t        j                  k7  rt/        d      |j                  |j"                        }| j1                  |      }|||z  }t        j2                  ||      }||fS # 1 sw Y   ZxY w)Nr         ?r*   r   r   F)enabledr   r   )betaalphar   zDError with upcasting, attn_weights does not have dtype torch.float32)r   rX   emptyfloat32r   r   r   r   r   autocastr   rZ   r   baddbmmr   r2   r   r   r   r|   r   r   r   RuntimeErrorr   r   )ry   r   r   r   r   r   bszr   	q_seq_lendk_	k_seq_lenr   scale_factorqkr   r   r   r   r   s                        r'   _upcast_and_reordered_attnz,ImageGPTAttention._upcast_and_reordered_attn	  sf   (-

%Y	2 XXZ1i {{3?IyPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L ^^ELL--u===Y3S]]2r5J5R5RSUWY[d5eqA ==qwwy!'')RS[ghL'//Y	9UL >
 &&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'.8L)rzzb),7 .eff#((5((6  ')3Lll<7L((C >=s    BJ99Kc                 x    |j                         dd ||fz   } |j                  | }|j                  dddd      S )zJ
        Splits hidden_size dim into attn_head_size and num_heads
        Nr*   r   r=   r   r   )r   r   permutery   r|   r   attn_head_size	new_shapes        r'   _split_headszImageGPTAttention._split_heads=  sE     KKM#2&)^)DD	i(~~aAq))r{   c                     |j                  dddd      j                         }|j                         dd ||z  fz   }|j                  |      S )zS
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        r   r=   r   r   Nr   )r   
contiguousr   r   r   s        r'   _merge_headszImageGPTAttention._merge_headsE  sO     1a+668KKM#2&)n*D)FF	{{9%%r{   hidden_states
layer_pastr   r   encoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentionscache_positionr}   c
                    |d u}
|j                   \  }}}|St        |t              rA|j                  j	                  | j
                        }|
r|j                  }n|j                  }n|}|
r|n|}|
r%t        | d      st        d      |[rY| j                  |      }j                  | j
                     j                  }|j                  | j
                     j                  }nQ| j                  |      }| j                  |      j                  | j                   d      \  }}|j#                  |d| j$                  | j&                        j)                  dd      }|j#                  |d| j$                  | j&                        j)                  dd      }n| j                  |      j                  | j                   d      \  }}}|j#                  |d| j$                  | j&                        j)                  dd      }|j#                  |d| j$                  | j&                        j)                  dd      }|D|
s|	nd }	j+                  ||| j
                  d|	i      \  }}|
rd|j                  | j
                  <   |j#                  ||| j$                  | j&                        j)                  dd      }| j,                  r| j/                  |||||      \  }}n| j1                  |||||      \  }}| j3                  || j$                  | j&                        }| j5                  |      }| j7                  |      }||fS )	Nr   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`.r=   r   r*   r   r   T)rU   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachehasattrr   r   layerskeysvaluesr8   rM   r   r   r   r   r   updater   r   r   r   r:   r   )ry   r   r   r   r   r   r   r   r   r   r   r   seq_lenr   r   curr_past_key_valuecurrent_statesr   r   r   r   r   s                         r'   r   zImageGPTAttention.forwardM  s    3$>'--Wa!*&9:'2266t~~F
%*4*J*J'*4*I*I'&0#2D.-4* t 
 %*M2)00@EE+224>>BIIM2![[8>>tTU>V
UhhsBFPPQRTUV

3DNNDMMJTTUVXYZ $N ; A A$//WX A YE3((3DNNDMMBLLQPQRCJJsBFPPQRTUVE!3E^4N,33CQacqPrsJC!8<
%%dnn5

3GQQRSUVW''(,(G(GsTY[ikt(u%K(,

5#unV_(`%K''T^^T]]Skk+.((5L((r{   )FN)NNNNNNNFFN)r   r   r   r   r   rT   rv   r   r   r   r   r   rX   rx   r
   r   r   r   r   s   @r'   r   r      s   )"8D> )"V^_bVc )"V;$)L2)h*& '+15,08<9=$),115D)||D) UOD) !.	D)
 ELL)D)  (5D) !) 6D) D>D) $D>D) !.D) 
D)r{   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ImageGPTMLPc                     t         |           |j                  }t        ||      | _        t        ||      | _        t        |j                     | _        t        j                  |j                        | _        y rt   )ru   rv   rq   r   c_fcr:   r	   activation_functionactr   r   r   dropout)ry   intermediate_sizer`   r   rz   s       r'   rv   zImageGPTMLP.__init__  s_    &&	,i8	Y(9:&445zz&"4"45r{   r   r}   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rt   )r  r  r:   r  )ry   r   s     r'   r   zImageGPTMLP.forward  s@    		-0/M2]3r{   )r   r   r   rv   rX   rx   r   r   r   s   @r'   r  r    s#    6U\\ ell r{   r  c                   
    e Zd Zd fd	Z	 	 	 	 	 	 	 	 ddej
                  dee   deej
                     deej
                     deej
                     deej
                     dee   d	ee   d
eej
                     de	fdZ
 xZS )ImageGPTBlockc                    t         |           |j                  }|j                  |j                  nd|z  }t	        ||j
                        | _        t        ||      | _        t	        ||j
                        | _	        |j                  r/t        |d|      | _        t	        ||j
                        | _        t        ||      | _        y )N   rr   r   T)r   r   )ru   rv   rq   n_innerrp   layer_norm_epsilonln_1r   r9   ln_2add_cross_attentioncrossattentionln_cross_attnr  mlp)ry   r`   r   rq   	inner_dimrz   s        r'   rv   zImageGPTBlock.__init__  s    ((&,nn&@FNNa+o	%kv7P7PQ	%f	B	%kv7P7PQ	%%"3Ft_h"iD!2;FD]D]!^Dy&1r{   r   r   r   r   r   r   r   r   r   r}   c
           
         |}
| j                  |      }| j                  |||||||	      }|d   }|dd  }||
z   }|Yt        | d      st        d|  d      |}
| j	                  |      }| j                  ||||||||	      }|d   }|
|z   }||dd  z   }|}
| j                  |      }| j                  |      }|
|z   }|f|z   S )N)r   r   r   r   r   r   r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   r   r   r   r   r   r   )r  r9   r   r   r  r  r  r  )ry   r   r   r   r   r   r   r   r   r   residualattn_outputsr   outputscross_attn_outputsfeed_forward_hidden_statess                   r'   r   zImageGPTBlock.forward  s:    !		-0yy!)/) ! 
 #1oqr"#h. ,4!12 =dV DZ Z  %H ..}=M!%!4!4%-#&;'="3- "5 	" -Q/K${2M 212 66G 		-0%)XXm%<" #=='))r{   rt   r   )r   r   r   rv   rX   rx   r   r
   r   r   r   r   r   s   @r'   r  r    s    2$ '+15,08<9=$),115:*||:* UO:* !.	:*
 ELL):*  (5:* !) 6:* D>:* $D>:* !.:* 
:*r{   r  c                   F     e Zd ZU eed<   eZdZdZdZ	dgZ
 fdZd Z xZS )ImageGPTPreTrainedModelr`   r-   	input_idsTr  c                 $    t        |   |i | y rt   )ru   rv   )ry   inputskwargsrz   s      r'   rv   z ImageGPTPreTrainedModel.__init__  s    &+F+r{   c           	         t        |t        j                  t        f      rl|j                  j
                  j                  d| j                  j                         |j                  |j                  j
                  j                          nt        |t        j                        ry|j                  j
                  j                  d| j                  j                         |j                  g|j                  j
                  |j                     j                          n5t        |t              r%|j                  j
                  j                  d       |j                         D ]m  \  }}d|v sd|v s|j
                  j                  d| j                  j                  t!        j"                  d| j                  j$                  z        z         o y)zInitialize the weights.g        )r   stdNr   r:   r0   r=   )r   r   Linearr   r0   r]   normal_r`   initializer_ranger2   zero_	Embeddingpadding_idxrp   fill_named_parametersmathr   n_layer)ry   modulerg   ps       r'   _init_weightsz%ImageGPTPreTrainedModel._init_weights  sJ   fryy&12 MM&&CT[[5R5R&S{{&  &&(-MM&&CT[[5R5R&S!!-""6#5#56<<> 12MM$$S) ..0GD!4H$4Cdkk.K.KdiiXY\`\g\g\o\oXoNp.pr 1r{   )r   r   r   r   __annotations__rn   load_tf_weightsbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesrv   r3  r   r   s   @r'   r   r     s3    1O%!O&*#(),sr{   r   c            $           e Zd Zdef fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e   de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e   de	e   de	e   de	e   de	e
j                     dedeeef   f d       Z xZS )ImageGPTModelr`   c           	      v   t         |   |       |j                  | _        t	        j
                  |j                  | j                        | _        t	        j
                  |j                  | j                        | _	        t	        j                  |j                        | _        t	        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t%        | j                  |j&                        | _        d| _        d | _        d| _        | j1                          y c c}w )Nr  r  F)ru   rv   rq   r   r   r+  r^   r4   r   r3   r   
embd_pdropdrop
ModuleListrangenum_hidden_layersr  hrp   r  ln_fmodel_parallel
device_mapgradient_checkpointing	post_init)ry   r`   irz   s      r'   rv   zImageGPTModel.__init__  s     ++<< 1 14>>B<< > >OJJv001	ERXRjRjLklLkqf BLklm%dnn&:S:ST	 $&+#  ms   
D6c                     | j                   S rt   r4   )ry   s    r'   get_input_embeddingsz"ImageGPTModel.get_input_embeddings.  s    xxr{   c                     || _         y rt   rJ  )ry   new_embeddingss     r'   set_input_embeddingsz"ImageGPTModel.set_input_embeddings1  s	    !r{   c                     |j                         D ]-  \  }}| j                  |   j                  j                  |       / y)zv
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        N)itemsrB  r9   r   )ry   heads_to_prunelayerr   s       r'   _prune_headszImageGPTModel._prune_heads4  s7     +002LE5FF5M**51 3r{   r!  past_key_valuesr   token_type_idsposition_idsr   inputs_embedsr   r   r   r   output_hidden_statesreturn_dictr   r$  r}   c                 
   ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
||n| j                   j                  }||t        d      |G| j                  ||       |j                         }|j                  d|d         }|j                  d   }n0|#|j                         dd }|j                  d   }nt        d      ||j                  n|j                  }| j                  r%| j                  r|
rt        j                  d       d}
|
r6|4t        t!        | j                         t!        | j                               }|
r:t#        |t$              r*t        j                  d	       t        j&                  |      }||j)                         n|}||j                  d|d         }|>t+        j,                  ||d   |z   t*        j.                  |
      }|j1                  d      }|z|dk  rt        d      |j                  |d      }|ddddddf   }|j3                  | j4                        }d|z
  t+        j6                  | j4                        j8                  z  }| j                   j:                  rE|C|j                         \  }}}||f}|	t+        j<                  ||      }	| j?                  |	      }	nd}	| jA                  || j                   jB                        }|| jE                  |      }| jG                  |      }||j3                  |j                        z   }|| jE                  |      }||z   }| jI                  |      }||j                  d      fz   }|rdnd}|r| j                   j:                  rdnd}|rdnd}tK        | jL                        D ]L  \  }} | jN                  r{t*        jP                  jS                  |j                         ||j3                  |j                        }t#        |t*        jT                        r|j3                  |j                        }|r||fz   } | |||||   ||	|
||	      }!|!d   }|r(||!d   fz   }| j                   j:                  r	||!d   fz   }| jN                  s| jV                  jY                         D ]J  \  }"}#||#d   k(  sdt[        |"      z   | j\                  k7  s+|j3                  dt[        |"dz         z         }L O | j_                  |      } |j                  | }|r||fz   }|st%        d |||||fD              S ta        |||||      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer*   r   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r`   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   z$batch_size has to be defined and > 0r   r   )r   r$   )r   r   r   r   r   r=   zcuda:c              3   $   K   | ]  }|| 
 y wrt   r$   )r%   vs     r'   r(   z(ImageGPTModel.forward.<locals>.<genexpr>  s      wA= wr)   )last_hidden_staterT  r   
attentionscross_attentions)1r`   r   rX  r   use_return_dictr   %warn_if_padding_and_no_attention_maskr   r   rU   r   rF  trainingrA   warning_oncer   r   r   r   from_legacy_cacheget_seq_lengthrX   arangelong	unsqueezetor   r   r   r  r   invert_attention_maskget_head_maskr0  r4   r3   r>  	enumeraterB  rD  cuda
set_devicerx   rE  rP  strlast_devicerC  r   )$ry   r!  rT  r   rU  rV  r   rW  r   r   r   r   rX  rY  r   r$  input_shape
batch_sizer   past_lengthencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeposition_embedsr   token_type_embedsoutput_shapeall_self_attentionsall_cross_attentionsall_hidden_statesrH  blockr  r   r\  s$                                       r'   r   zImageGPTModel.forward;  s   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"66y.Q#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T&&4==##p "	01,dkk2RT`hlhshsTtuOOU;\
 2CCOTO:I:Uo446[j%+00[_EN <<[_{5RZ_ZdZdmstL'11!4L %Q !GHH+00R@N ,AtT1,<=N ,..TZZ.@N!N2ekk$**6M6Q6QQN ;;**/D/P=R=W=W=Y: 7$68O#P %-).4HQW)X&%)%?%?@V%W"%)" &&y$++2E2EF	  HHY/M((<0%(:(:=;O;O(PP% $ 8),==M		-0"m&8&8&<%>>$5b4%64;;;Z;Zr`d"6BD!$&&)HAu""

%%m&:&:;!-%3%6%6}7K7K%LNi6 )]-A-A BI#$58H$H!!%'=#"3-
G $AJM &9WQZM&I#;;22+?71:-+O( "" OO113DAqAbEzgA&6$:J:J&J(5(8(83q1u:9M(N 4A *H 		-0***L9   1]4D D ':KM`bvw   9+++*1
 	
r{   )NNNNNNNNNNNNNN)r   r   r   r   rv   rK  rN  rS  r   r   rX   rx   r
   r   r   r   r   r   r   r   r   s   @r'   r;  r;    s~   ~ &"2  -1+/1515/3,0048<9=$(,0/3&*15K
ELL)K
 "%K
 !.	K

 !.K
 u||,K
 ELL)K
  -K
  (5K
 !) 6K
 D>K
 $D>K
 'tnK
 d^K
 !.K
  !K
" 
u??	@#K
 K
r{   r;  z
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            &           e Zd ZdgZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     dee
   deej                     deej                     deej                     d	eej                     d
eej                     deej                     deej                     deej                     dee   dee   dee   dee   deej                     dedeeef   f"d       Z xZS )ImageGPTForCausalImageModelingzlm_head.weightr`   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  dz
  d      | _        d| _	        d | _
        | j                          y )Nr   Fr2   )ru   rv   r;  r-   r   r'  r[   r^   r;   rD  rE  rG  ry   r`   rz   s     r'   rv   z'ImageGPTForCausalImageModeling.__init__  s[     (0yy0A0AA0EER $r{   r!  rT  r   rU  rV  r   rW  r   r   labelsr   r   rX  rY  r   r$  r}   c                 (   ||n| j                   j                  }| j                  |||||||||	|||||      }|d   }| j                  |      }d}|
r|dddddf   j	                         }|
dddf   j	                         }t               } ||j                  d|j                  d            |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a&
  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
        >>> import torch
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> # unconditional generation of 8 images
        >>> batch_size = 4
        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
        >>> context = context.to(device)
        >>> output = model.generate(
        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
        ... )

        >>> clusters = image_processor.clusters
        >>> height = image_processor.size["height"]
        >>> width = image_processor.size["width"]

        >>> samples = output[:, 1:].detach().cpu().numpy()
        >>> samples_img = [
        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
        ... ]  # convert color cluster tokens back to pixels
        >>> f, axes = plt.subplots(1, batch_size, dpi=300)

        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
        ...     ax.axis("off")
        ...     ax.imshow(img)
        ```N)rT  r   rU  rV  r   rW  r   r   r   r   rX  rY  r   r   .r*   r   )losslogitsrT  r   r^  r_  )r`   r`  r-   r;   r   r   r   r   r   rT  r   r^  r_  )ry   r!  rT  r   rU  rV  r   rW  r   r   r  r   r   rX  rY  r   r$  transformer_outputsr   	lm_logitsr  shift_logitsshift_labelsloss_fctoutputs                            r'   r   z&ImageGPTForCausalImageModeling.forward  sR   N &1%<k$++B]B]"..+))%'"7#9/!5#) / 
  ,A.LL/	$S#2#q[1<<>L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`D\$7$;;F)-)9TGf$EvE0/??-;;*550AA
 	
r{   )NNNNNNNNNNNNNNN)r   r   r   _tied_weights_keysr   rv   r   r   rX   rx   r
   r   r   r   r   r   r   r   r   s   @r'   r  r  
  s    ++	~ 	  -1+/1515/3,0048<9=)-$(,0/3&*15!p
ELL)p
 "%p
 !.	p

 !.p
 u||,p
 ELL)p
  -p
  (5p
 !) 6p
 &p
 D>p
 $D>p
 'tnp
 d^p
  !.!p
" #p
$ 
u77	8%p
 p
r{   r  z
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    c                    f    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     dee	   deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee
   dee
   dee
   dee
   dedeeef   fd       Z xZS )ImageGPTForImageClassificationr`   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y )NFr  )
ru   rv   
num_labelsr;  r-   r   r'  r[   scorerG  r  s     r'   rv   z'ImageGPTForImageClassification.__init__  sR      ++(0YYv}}dooEJ
 	r{   r!  rT  r   rU  rV  r   rW  r  r   r   rX  rY  r$  r}   c                    ||n| j                   j                  }| j                  ||||||||	|
||      }|d   }|j                  d      }| j	                  |      }d}|| j                  ||| j                         }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )ay  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```N)
rT  r   rU  rV  r   rW  r   r   rX  rY  r   r   r   )r  r  rT  r   r^  )
r`   r`  r-   r   r  loss_functionr   rT  r   r^  )ry   r!  rT  r   rU  rV  r   rW  r  r   r   rX  rY  r$  r  r   pooled_hidden_statesr  r  r  s                       r'   r   z&ImageGPTForImageClassification.forward  s    d &1%<k$++B]B]"..+))%'/!5# / 
 ,A.,11a1801%%ffdkkBDY!4QR!88F)-)9TGf$EvE//??-;;*55
 	
r{   )NNNNNNNNNNNN)r   r   r   r   rv   r   r   rX   rx   r
   r   r   r   r   r   r   r   r   s   @r'   r  r    sA   ~   -1+/1515/3,004)-$(,0/3&*T
ELL)T
 "%T
 !.	T

 !.T
 u||,T
 ELL)T
  -T
 &T
 D>T
 $D>T
 'tnT
 d^T
 T
 
u66	7T
 T
r{   r  )r  r  r;  r   rn   )3__doc__r/  rC   typingr   r   r   rX   r   torch.nnr   activationsr	   cache_utilsr
   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_imagegptr   
get_loggerr   rA   rn   Modulerp   r   r  r  r   r;  r  r  __all__r$   r{   r'   <module>r     sL   %  	 ' '   % ! C C ) 9 
 . Y Y 
 3 
		H	%iX
		 
i)		 i)X")) "J*. J*Z #so #s #sL m
+ m
 m
` 
%<o 

D _
%< _
_
Dr{   