
    <h}                        S r SSKJrJr  SSKrSSKrSSKJr  SSKJ	r	J
r
Jr  SSKJrJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSKJr  \R@                  " \!5      r"S r#S r$S#S jr% " S S\RL                  5      r'S r( " S S\RL                  5      r)\ " S S\5      5       r*\ " S S\*5      5       r+\" SS9 " S S\*\5      5       r,\" SS9 " S  S!\*5      5       r-/ S"Qr.g)$zPyTorch CTRL model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPastSequenceClassifierOutput)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
CTRLConfigc                 P    S[         R                  " SSUS-  -  U-  5      -  nX-  $ )Nr   i'     )torchpow)posid_model_sizeangle_ratess       ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/ctrl/modeling_ctrl.py
angle_defnr!   (   s-    eiiQ!V'DEEK    c                    [        [        R                  " U [        R                  S9R	                  U5      R                  S5      [        R                  " U[        R                  S9R	                  U5      R                  S5      U5      n[        R                  " US S 2SS S24   5      n[        R                  " US S 2SS S24   5      n[        R                  " XE/SS9nU$ )Ndtyper   r   r   dim)	r!   r   arangeint64to	unsqueezesincoscat)positionr   r%   
angle_radssinescosinespos_encodings          r    positional_encodingr5   -   s    XU[[144U;EEaH\588?II!LJ IIjADqD)*Eii
1add7+,G99e-26Lr"   c           	         [         R                  " XR                  SSSS5      5      nUR                  S   nU[        R
                  " U5      -  nUb3  UR                  S5      UR                  S5      pXX-
  U
2S U
24   S-  -  nUb  X-   n[         R                  " USS9nUb  X-  n[         R                  " X5      nX4$ )	Nr   r   r	   r   r&   g     r'   )r   matmulpermuteshapenpsqrtsizesoftmax)qkvmaskattention_mask	head_mask	matmul_qkdkscaled_attention_logitsndnsattention_weightsoutputs                r    scaled_dot_product_attentionrL   <   s    Q		!Q1 56I	
B'"''"+5(--b13J3O3OPR3SB"crc(9#:T#AA!"9"J&=2F -9\\+/F$$r"   c                   N   ^  \ rS rSrSU 4S jjrS rS r      SS jrSrU =r	$ )	MultiHeadAttentionV   c                 t  > [         TU ]  5         X l        Xl        X0l        [        XR                  -  5      U l        [        R                  " X5      U l	        [        R                  " X5      U l
        [        R                  " X5      U l        [        R                  " X5      U l        [        5       U l        g N)super__init__	num_headsr   	layer_idxintdepthr   LinearWqWkWvdensesetpruned_heads)selfr   rT   rU   	__class__s       r    rS   MultiHeadAttention.__init__W   s{    "("67
))L7))L7))L7YY|:
Er"   c                    U R                   U R                  -  n[        U5      S:X  a  g [        XR                  X R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        X R                  -  U l         U R                  R                  U5      U l        g )Nr   r   r'   )r   rT   lenr   r^   r   rY   rZ   r[   r\   union)r_   headsattention_head_sizeindexs       r    prune_headsMultiHeadAttention.prune_headsf   s    "//4>>Au:?7~~Obduduv %TWWe4$TWWe4$TWWe4'

EqA
 #e*4/..@ --33E:r"   c                 x    UR                  USU R                  U R                  5      nUR                  / SQ5      $ )Nr&   r   r   r   r	   )reshaperT   rW   r9   )r_   x
batch_sizes      r    split_into_heads#MultiHeadAttention.split_into_headsw   s-    IIj"dnndjjAyy&&r"   c                    UR                   S   nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  X;5      nU R	                  X+5      nU R	                  X5      nUb!  UR                  X!U R                  SU
05      u  p![        X2XXg5      nUS   R                  / SQ5      nUS   nUR                  USU R                  5      nU R                  U5      nX4$ )Nr   cache_positionrk   r   r&   )r:   rY   rZ   r[   ro   updaterU   rL   r9   rl   r   r\   )r_   rA   r@   r?   rB   
layer_pastrC   rD   	use_cacheoutput_attentionsrr   rn   rK   scaled_attentionattnoriginal_size_attentions                   r    forwardMultiHeadAttention.forward{   s     WWQZ
GGAJGGAJGGAJ!!!0!!!0!!!0!$$Q4>><Ln;]^DA-aA^W!!9,,\:ay"2":"::r4K\K\"]34|r"   )	rZ   rY   r[   r   r\   rW   rU   rT   r^   rQ   NNNFFN)
__name__
__module____qualname____firstlineno__rS   rh   ro   rz   __static_attributes____classcell__r`   s   @r    rN   rN   V   s0    ";"'  r"   rN   c                     [         R                  " [         R                  " X5      [         R                  " 5       [         R                  " X5      5      $ rQ   )r   
SequentialrX   ReLU)r   dffs     r    point_wise_feed_forward_networkr      s-    ==<5rwwy"))CB^__r"   c                   B   ^  \ rS rSrSU 4S jjr      SS jrSrU =r$ )EncoderLayer   c                 4  > [         TU ]  5         [        XUS9U l        [	        X5      U l        [        R                  " USS9U l        [        R                  " USS9U l	        [        R                  " U5      U l        [        R                  " U5      U l        g )NrU   gư>eps)rR   rS   rN   multi_head_attentionr   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)r_   r   rT   r   raterU   r`   s         r    rS   EncoderLayer.__init__   sn    $6|Zc$d!2<E,,|>,,|>

4(

4(r"   c	                    U R                  U5      n	U R                  U	U	U	UUUUUUUS9
n
U
S   nU R                  U5      nX-   nU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU4U
SS  -   nU$ )Nrt   rC   rD   ru   rv   rr   r   r   )r   r   r   r   r   r   )r_   rm   rB   rt   rC   rD   ru   rv   rr   normedattn_outputsattn_outputout1out2
ffn_outputoutputss                   r    rz   EncoderLayer.forward   s     #00!)/) 1 
 #1ommK0t$XXd^
]]:.
 'L,,r"   )r   r   r   r   r   r   )g?Nr|   )r}   r~   r   r   rS   rz   r   r   r   s   @r    r   r      s&    
)  " "r"   r   c                   *    \ rS rSr% \\S'   SrS rSrg)CTRLPreTrainedModel   configtransformerc                 $   [        U[        R                  [        45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weights.g        )meanstdN      ?)
isinstancer   rX   r   weightdatanormal_r   initializer_rangebiaszero_	Embeddingpadding_idxr   fill_)r_   modules     r    _init_weights!CTRLPreTrainedModel._init_weights   s   fryy&122 MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r"    N)	r}   r~   r   r   r   __annotations__base_model_prefixr   r   r   r"   r    r   r      s    %*r"   r   c                     ^  \ rS rSrU 4S jrS rS rS r\            SS\	\
R                     S\	\\\
R                           S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\
R                     \4   4S jj5       rSrU =r$ )	CTRLModel   c                   > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  U R                  [        R                  5      U l
        [        R                  " UR                  UR                  5      U l        [        R                  " UR                   5      U l        [        R$                  " ['        UR                  5       Vs/ sH8  n[)        UR                  UR*                  UR,                  UR.                  US9PM:     sn5      U l        [        R2                  " UR                  UR4                  S9U l        U R9                  5         g s  snf )Nr   r   )rR   rS   n_embdr   n_layer
num_layersr5   n_positionsr   floatr4   r   r   
vocab_sizewr   
embd_pdropdropout
ModuleListranger   n_headr   resid_pdrophr   layer_norm_epsilon	layernorm	post_init)r_   r   r   r`   s      r    rS   CTRLModel.__init__   s    "MM ../0B0BDDUDUW\WbWbcf//?zz&"3"34 v~~..A V]]FMM6::vGYGYefg.
 fmm9R9RS 	s   #>E,c                     U R                   $ rQ   r   )r_   s    r    get_input_embeddingsCTRLModel.get_input_embeddings  s    vvr"   c                     Xl         g rQ   r   )r_   new_embeddingss     r    set_input_embeddingsCTRLModel.set_input_embeddings  s    r"   c                     UR                  5        H-  u  p#U R                  U   R                  R                  U5        M/     g)zf
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
N)itemsr   r   rh   )r_   heads_to_prunelayerre   s       r    _prune_headsCTRLModel._prune_heads  s5     +002LEFF5M..::5A 3r"   	input_idspast_key_valuesrC   token_type_idsposition_idsrD   inputs_embedsru   rv   output_hidden_statesreturn_dictrr   returnc                    U	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  Ub  [        S5      eUbF  U R                  X5        UR                  5       nUR                  SUS   5      nUR                  S   nO1Ub#  UR                  5       SS nUR                  S   nO[        S5      eUb  UR                  OUR                  nSnU(       aB  [        U[        5      (       d-  [        R                  S5        Sn[        R                   " U5      nUb  UR#                  5       OSnUc<  [$        R&                  " UUS   U-   [$        R(                  US	9nUR+                  S5      nUb  US::  a  [        S
5      eUR                  US5      nUR+                  S5      R+                  S5      nUR-                  U R.                  S9nSU-
  [$        R0                  " U R.                  5      R2                  -  nU R5                  X`R                   R6                  5      nUbJ  UR                  SUS   5      nU R9                  U5      nU[:        R<                  " U R>                  5      -  nOSnUc  U R9                  U5      nUS   n[$        R@                  " [$        RB                  " UU-   UU-   5      S5      R-                  U5      nU[:        R<                  " U R>                  5      -  nU RD                  R-                  U5      U l"        U RD                  USS24   nUU-   U-   nU RG                  U5      nU
(       a  SOSnU	(       a  SOSn[I        U RJ                  5       H:  u  nnU
(       a  UU4-   nU" UUUUUU   UU	US9nUS   nU	(       d  M1  UUS   4-  nM<     U RM                  U5      nU
(       a  UU4-   nU(       a  URO                  5       nU(       d  [Q        S UUUU4 5       5      $ [S        UUUUS9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
    [`PreTrainedTokenizer.encode`] for details.

    [What are input IDs?](../glossary#input-ids)

Example:

```python
>>> from transformers import AutoTokenizer, CTRLModel
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 5, 1280]
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer&   r   z5You have to specify either input_ids or inputs_embedsFzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.T)r%   devicez$batch_size has to be defined and > 0r   r   r$   r   r   r   c              3   ,   #    U H  oc  M  Uv   M     g 7frQ   r   ).0rA   s     r    	<genexpr>$CTRLModel.forward.<locals>.<genexpr>  s      ^a^s   	)last_hidden_stater   hidden_states
attentions)*r   rv   ru   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr=   viewr:   r   r   r
   loggerwarning_oncer   from_legacy_cacheget_seq_lengthr   r)   longr,   r+   r%   finfominget_head_maskr   r   r;   r<   r   triuonesr4   r   	enumerater   r   to_legacy_cachetupler   )r_   r   r   rC   r   r   rD   r   ru   rv   r   r   rr   kwargsinput_shapern   r   return_legacy_cachepast_lengthtoken_type_embedsseq_lenrB   
pos_embedsr   all_hidden_statesall_attentionsr   r   r   s                                r    rz   CTRLModel.forward  s:   b 2C1N-TXT_T_TqTq!*!6IDKK<Q<Q	$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T#Z??U
 #'*<<_MO:I:Uo446[\ <<[_{5RZ_ZdZdmstL'11!4L %Q !GHH+00R@N ,55a8BB1EN ,..TZZ.@N!N2ekk$**6M6Q6QQN &&y++2E2EF	%+00[_EN $~ 6):):!;; !  FF9-Mb/zz%**W{%:Gk<QRTUVYYZ`a!2!233 !--008&&|Q7
%
25FF]3"6BD0ddff%DAq#$58H$H!*-#A,#"3-	G $AJM  71:-/ &" }5 1]4D D-==?O )?<M~^   '+++%	
 	
r"   )r   r   r   r   r   r4   r   NNNNNNNNNNNN)r}   r~   r   r   rS   r   r   r   r   r   r   
LongTensorr  FloatTensorboolTensorr   r   rz   r   r   r   s   @r    r   r      se   , B  15EI6:59371559$(,0/3&*15g
E,,-g
 "%e.?.?(@"ABg
 !!2!23	g

 !!1!12g
 u//0g
 E--.g
   1 12g
 D>g
 $D>g
 'tng
 d^g
 !.g
 
uU\\"$;;	<g
 g
r"   r   z
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                      ^  \ rS rSrS/rU 4S jr\             SS\\R                     S\\
\
\R                           S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\
\R                     \4   4S jj5       rSS jrSrU =r$ )CTRLLMHeadModeli  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NTr   )
rR   rS   r   r   r   rX   r   r   lm_headr   r_   r   r`   s     r    rS   CTRLLMHeadModel.__init__  sG     $V,yy0A0AM 	r"   r   r   rC   r   r   rD   r   labelsru   rv   r   r   rr   r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUUS9nUS   nU R                  U5      nSnUb*  U R                  " UU4SU R                   R
                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
    [`PreTrainedTokenizer.encode`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLLMHeadModel

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> sequence_ids = model.generate(inputs["input_ids"])
>>> sequences = tokenizer.batch_decode(sequence_ids)
>>> sequences
['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> round(outputs.loss.item(), 2)
9.21

>>> list(outputs.logits.shape)
[1, 5, 246534]
```N)r   rC   r   r   rD   r   ru   rv   r   r   rr   r   r   r   )losslogitsr   r   r   )
r   r   r   r  loss_functionr   r   r   r   r   )r_   r   r   rC   r   r   rD   r   r  ru   rv   r   r   rr   r  transformer_outputsr   	lm_logitsr  rK   s                       r    rz   CTRLLMHeadModel.forward  s   x &1%<k$++B]B]"..+))%'/!5#) / 
 ,A.LL/	%%  ;;11 	D \$7$;;F)-)9TGf$EvE%/??-;;*55
 	
r"   c                     UbC  UR                  5       nUR                  S   U:  a  UnOUR                  S   S-
  nUS S 2US 24   nXUS.$ )Nr   )r   r   ru   )r   r:   )r_   r   r   ru   r  r  remove_prefix_lengths          r    prepare_inputs_for_generation-CTRLLMHeadModel.prepare_inputs_for_generation0  sf     &)88:K q!K/'2$ (1q'9A'=$!!%9%:":;I&Ybccr"   )r  r   )NNNNNNNNNNNNNNN)r}   r~   r   r   _tied_weights_keysrS   r   r   r   r  r  r  r  r  r   r   rz   r$  r   r   r   s   @r    r  r    s    ++  15EI6:59371559-1$(,0/3&*15c
E,,-c
 "%e.?.?(@"ABc
 !!2!23	c

 !!1!12c
 u//0c
 E--.c
   1 12c
 ))*c
 D>c
 $D>c
 'tnc
 d^c
 !.c
  
uU\\"$::	;!c
 c
Jd dr"   r  a  
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\	\	\R                           S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\\	\R                     \4   4S jj5       rSrU =r$ )CTRLForSequenceClassificationiC  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g )NFr  )
rR   rS   
num_labelsr   r   r   rX   r   
classifierr   r  s     r    rS   &CTRLForSequenceClassification.__init__O  sR      ++$V,))FMM4??O 	r"   r   r   rC   r   r   rD   r   r  ru   rv   r   r   r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   nU R                  U5      nUb  UR                  SS u  nnOUR                  SS u  nnU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S	35        U[        R                  " UUR                  S
9U4   nSnUGb  U R                   R"                  c  U R$                  S:X  a  SU R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                   R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                   R"                  S:X  a  [5        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  S9$ )a"  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
    [`PreTrainedTokenizer.encode`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Example of single-label classification:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax().item()
>>> model.config.id2label[predicted_class_id]
'LABEL_0'
```

```python
>>> import torch

>>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
>>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
>>> num_labels = len(model.config.id2label)
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

>>> labels = torch.tensor(1)
>>> loss = model(**inputs, labels=labels).loss
>>> round(loss.item(), 2)
0.93
```

Example of multi-label classification:

```python
>>> import torch
>>> from transformers import AutoTokenizer, CTRLForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
>>> model = CTRLForSequenceClassification.from_pretrained(
...     "Salesforce/ctrl", problem_type="multi_label_classification"
... )

>>> # CTRL was trained with control codes as the first token
>>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
>>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax().item()
>>> model.config.id2label[predicted_class_id]
'LABEL_0'
```

```python
>>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
>>> num_labels = len(model.config.id2label)
>>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

>>> num_labels = len(model.config.id2label)
>>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
...     torch.float
... )
>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()  # doctest: +IGNORE_RESULT
```N)
r   rC   r   r   rD   r   ru   rv   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r&   )r   r%   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r   
regressionsingle_label_classificationmulti_label_classification)r  r  r   r   )r   r   r   r,  r:   pad_token_idr   r+   r   r   int32r)   argmaxr   r   r`   r}   problem_typer+  r%   r   rV   r   squeezer   r   r   r   r   r   )r_   r   r   rC   r   r   rD   r   r  ru   rv   r   r   r  r   r  rn   sequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctrK   s                            r    rz   %CTRLForSequenceClassification.forwardX  s   P &1%<k$++B]B]"..+))%'/!5# / 
 ,A./ *3//"1*='J*7*=*=bq*A'J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE' -;;*55	
 	
r"   )r,  r+  r   r  )r}   r~   r   r   rS   r   r   r   r  r  r  r  r   r  r   rz   r   r   r   s   @r    r)  r)  C  sW     15EI6:59371559-1$(,0/3&*p
E,,-p
 "%e.?.?(@"ABp
 !!2!23	p

 !!1!12p
 u//0p
 E--.p
   1 12p
 ))*p
 D>p
 $D>p
 'tnp
 d^p
 
uU\\"$<<	=p
 p
r"   r)  )r)  r  r   r   r&  )/__doc__typingr   r   numpyr;   r   r   torch.nnr   r   r   cache_utilsr
   r   
generationr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_ctrlr   
get_loggerr}   r   r!   r5   rL   ModulerN   r   r   r   r   r  r)  __all__r   r"   r    <module>rL     s"     "    A A . ) i i - Y Y + 
		H	%
%4D DN`/299 /d */ * ** L
# L
 L
^ Ad)? AdAdH 
{
$7 {

{
| cr"   