ó
    <±ha[  ã                   óÒ  • S SK JrJrJr  S SKrS SKJr  SSKJr  SSK	J
r
  SSKJrJr  SSKJrJr  SSKJr  SS	KJrJr  S
SKJrJrJrJrJrJrJr  S
SKJr  S
SK J!r!J"r"  \RF                  " \$5      r% " S S\5      r& " S S\5      r' " S S\!5      r( " S S\5      r) " S S\5      r* " S S\5      r+ " S S\5      r, " S S\5      r-\ " S S\5      5       r. " S  S!\5      r// S"Qr0g)#é    )ÚCallableÚOptionalÚUnionNé   )ÚPretrainedConfig)ÚFlashAttentionKwargs)ÚBaseModelOutputÚBaseModelOutputWithPooling)ÚALL_ATTENTION_FUNCTIONSÚPreTrainedModel)ÚUnpack)Úauto_docstringÚloggingé   )ÚCLIPMLPÚCLIPAttentionÚCLIPEncoderÚCLIPEncoderLayerÚCLIPVisionEmbeddingsÚCLIPVisionModelÚCLIPVisionTransformer)Úeager_attention_forward)ÚVisionRotaryEmbeddingÚapply_rotary_pos_emb_visionc                   óR   ^ • \ rS rSrSrSrSr             SU 4S jjrSrU =r	$ )ÚMLCDVisionConfigé*   af  
This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
[DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1664):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 8192):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    projection_dim (`int`, *optional*, defaults to 1024):
        Dimensionality of text and vision projection layers.
    num_hidden_layers (`int`, *optional*, defaults to 48):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        The number of input channels.
    image_size (`int`, *optional*, defaults to 336):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 14):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import MLCDVisionConfig, MLCDVisionModel

>>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
>>> configuration = MLCDVisionConfig()

>>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
>>> model = MLCDVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```Úmlcd_vision_modelÚvision_configc                 óÂ   >• [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        X€l        Xpl	        XÀl
        XÐl        X°l        X l        Xl        g )N© )ÚsuperÚ__init__Úhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚnum_attention_headsÚnum_key_value_groupsÚnum_channelsÚ
patch_sizeÚ
image_sizeÚinitializer_rangeÚinitializer_factorÚattention_dropoutÚlayer_norm_epsÚ
hidden_act)Úselfr$   r%   r&   r'   r(   r)   r+   r*   r0   r/   r.   r,   r-   ÚkwargsÚ	__class__s                  €Ú]/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/mlcd/modular_mlcd.pyr#   ÚMLCDVisionConfig.__init__d   s`   ø€ ô" 	‰ÒÑ"˜6Ò"à&ÔØ!2ÔØ!2ÔØ#6Ô Ø$8Ô!Ø(ÔØ$ŒØ$ŒØ!2ÔØ"4ÔØ!2ÔØ,ÔØ$ó    )r.   r0   r$   r+   r-   r,   r%   r/   r'   r)   r&   r(   r*   )i€  i    é0   é   é   r   iP  é   Úgelugñhãˆµøä>ç        g{®Gáz”?ç      ð?)
Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__Ú
model_typeÚbase_config_keyr#   Ú__static_attributes__Ú__classcell__©r3   s   @r4   r   r   *   sH   ø† ñ4ðl %€JØ%€Oð ØØØØØØØØØØØØ÷%õ %r6   r   c                   ó   • \ rS rSrSrg)ÚMLCDMLPé†   r!   N)r>   r?   r@   rA   rE   r!   r6   r4   rI   rI   †   s   † Úr6   rI   c                   ó>   • \ rS rSrS\S\S\R                  4S jrSrg)ÚMLCDRotaryEmbeddingéŠ   Únum_patches_heightÚnum_patches_widthÚreturnc                 ó~  • [         R                  " XR                  R                  S9R	                  S5      R                  SU5      n[         R                  " X R                  R                  S9R	                  S5      R                  US5      n[         R                  " UR                  5       UR                  5       /SS9n[        X5      n[         R                  " X`R                  R                  U R                  R                  S9n[         R                  " XpR                  5      nX…   R                  S5      n	U	$ )aE  
Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

Args:
    num_patches_height (int): Number of patches in the height dimension.
    num_patches_width (int): Number of patches in the width dimension.

Returns:
    torch.Tensor: Rotary positional embeddings for the given grid size.
)Údevicer9   éÿÿÿÿr   ©Údim)rR   Údtype)ÚtorchÚarangeÚinv_freqrR   Ú	unsqueezeÚexpandÚstackÚflattenÚmaxrV   Úouter)
r1   rN   rO   Úhpos_idsÚwpos_idsÚpos_idsÚmax_grid_sizeÚseqÚrotary_pos_emb_fullÚrotary_pos_embs
             r4   ÚforwardÚMLCDRotaryEmbedding.forward‹   sü   € ô LŠLÐ+·M±M×4HÑ4HÑI×SÑSÐTUÓV×]Ñ]Ð^`ÐbsÓtð 	ô LŠLÐ*·=±=×3GÑ3GÑH×RÑRÐSTÓU×\Ñ\Ð]oÐqsÓtð 	ô
 —+’+˜x×/Ñ/Ó1°8×3CÑ3CÓ3EÐFÈBÑOˆô Ð.ÓBˆÜlŠl˜=·±×1EÑ1EÈTÏ]É]×M`ÑM`ÑaˆÜ#Ÿkšk¨#¯}©}Ó=Ðð -Ñ5×=Ñ=¸aÓ@ˆàÐr6   r!   N)	r>   r?   r@   rA   ÚintrW   ÚTensorrg   rE   r!   r6   r4   rL   rL   Š   s    † ð¨#ð À#ð È%Ï,É,÷ r6   rL   c                   ój   ^ • \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )ÚMLCDVisionEmbeddingsé¬   Úconfigc                 ó(   >• [         TU ]  U5        U ?g ©N)r"   r#   Úposition_embedding©r1   rn   r3   s     €r4   r#   ÚMLCDVisionEmbeddings.__init__­   s   ø€ Ü‰Ñ˜Ô ØÑ#r6   Úpixel_valuesrP   c                 óH  • UR                   S   nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n[        R                  " XT/SS9nU$ )Nr   )rV   r   r9   rS   rT   )ÚshapeÚpatch_embeddingÚweightrV   Útor]   Ú	transposeÚclass_embeddingr[   rW   Úcat)r1   rt   Ú
batch_sizeÚtarget_dtypeÚpatch_embedsÚclass_embedsÚ
embeddingss          r4   rg   ÚMLCDVisionEmbeddings.forward±   s”   € Ø!×'Ñ'¨Ñ*ˆ
Ø×+Ñ+×2Ñ2×8Ñ8ˆà×+Ñ+¨L¯O©OÀ,¨OÐ,OÓPˆØ#×+Ñ+¨AÓ.×8Ñ8¸¸AÓ>ˆà×+Ñ+×2Ñ2°:¸qÀ"ÓEˆÜ—Y’Y Ð;ÀÑCˆ
àÐr6   r!   )r>   r?   r@   rA   r   r#   rW   ÚFloatTensorrj   rg   rE   rF   rG   s   @r4   rl   rl   ¬   s2   ø† ð$Ð/÷ $ð
 E×$5Ñ$5ð 
¸%¿,¹,÷ 
ò 
r6   rl   c                   óö   ^ • \ rS rSrSrS\4U 4S jjr SS\R                  S\	\R                  \R                  4   S\
\R                     S\\   S	\	\R                  \
\R                     4   4
S
 jjrSrU =r$ )ÚMLCDAttentioné¾   zëMulti-headed attention with RoPE. Refer to papers:
- Attention is all you need:
    https://huggingface.co/papers/1706.03762
- RoFormer: Enhanced Transformer with Rotary Position Embedding:
    https://huggingface.co/papers/2104.09864
rn   c                 óT   >• [         TU ]  U5        UR                  U l        SU l        g ©NF)r"   r#   r(   Ú	is_causalrr   s     €r4   r#   ÚMLCDAttention.__init__Æ   s%   ø€ Ü‰Ñ˜Ô Ø$*×$?Ñ$?ˆÔ!Øˆr6   Úhidden_statesÚposition_embeddingsÚattention_maskr2   rP   c                 ó²  • UR                   S S u  pVU R                  U5      R                  XVU R                  U R                  45      nU R                  U5      R                  XVU R                  U R                  45      nU R                  U5      R                  XVU R                  U R                  45      n	US   R                  S5      R                  5       n
US   R                  S5      R                  5       n[        XxX«5      u  pxUR                  SSSS5      R                  5       nUR                  SSSS5      R                  5       nU	R                  SSSS5      R                  5       n	[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UUU	U4U R                   (       d  SOU R"                  U R$                  U R&                  S.UD6u  pÞUR                  SSSS5      R                  5       nUR)                  XeS5      nU R+                  U5      nUR                  SSS5      R                  5       nXÞ4$ )	NrS   r   r9   r   r   Úeagerr<   )ÚdropoutÚscalingr‰   )rv   Úq_projÚreshapeÚ	num_headsÚhead_dimÚk_projÚv_projrZ   Úfloatr   ÚpermuteÚ
contiguousr   rn   Ú_attn_implementationr   Útrainingr   Úscaler‰   ÚviewÚout_proj)r1   r‹   rŒ   r   r2   r}   Ú
seq_lengthÚquery_statesÚ
key_statesÚvalue_statesÚcosÚsinÚattention_interfaceÚattn_outputÚattn_weightss                  r4   rg   ÚMLCDAttention.forwardË   s0  € ð "/×!4Ñ!4°S°bÐ!9Ñˆ
ð —{‘{ =Ó1×9Ñ9¸:ÐSW×SaÑSaÐcg×cpÑcpÐ:qÓrˆØ—[‘[ Ó/×7Ñ7¸ÐQU×Q_ÑQ_Ðae×anÑanÐ8oÓpˆ
Ø—{‘{ =Ó1×9Ñ9¸:ÐSW×SaÑSaÐcg×cpÑcpÐ:qÓrˆð " !Ñ$×.Ñ.¨qÓ1×7Ñ7Ó9ˆØ! !Ñ$×.Ñ.¨qÓ1×7Ñ7Ó9ˆÜ#>¸|ÐY\Ó#bÑ ˆð $×+Ñ+¨A¨q°!°QÓ7×BÑBÓDˆØ×'Ñ'¨¨1¨a°Ó3×>Ñ>Ó@ˆ
Ø#×+Ñ+¨A¨q°!°QÓ7×BÑBÓDˆä(?ÐØ;‰;×+Ñ+¨wÓ6Ü"9¸$¿+¹+×:ZÑ:ZÑ"[Ðá$7ØØØØØð
%
ð  $Ÿ}Ÿ}‘C°$·,±,Ø—J‘JØ—n‘nñ
%
ð ñ
%
Ñ!ˆð "×)Ñ)¨!¨Q°°1Ó5×@Ñ@ÓBˆØ!×&Ñ& z¸rÓBˆØ—m‘m KÓ0ˆØ!×)Ñ)¨!¨Q°Ó2×=Ñ=Ó?ˆØÐ(Ð(r6   )r‰   r(   rp   )r>   r?   r@   rA   rB   r   r#   rW   rj   Útupler   r   r   rg   rE   rF   rG   s   @r4   r…   r…   ¾   s   ø† ñðÐ/÷ ð 26ñ	,)à—|‘|ð,)ð # 5§<¡<°·±Ð#=Ñ>ð,)ð ! §¡Ñ.ð	,)ð
 Ð-Ñ.ð,)ð 
ˆu|‰|˜X e§l¡lÑ3Ð3Ñ	4÷,)ó ,)r6   r…   c                   óÖ   ^ • \ rS rSrS\4U 4S jjr  SS\R                  S\\R                  \R                  4   S\	\R                     S\	\
   S\\R                     4
S	 jjrS
rU =r$ )ÚMLCDEncoderLayeréú   rn   c                 óD   >• [         TU ]  U5        [        U5      U l        g rp   )r"   r#   r…   Ú	self_attnrr   s     €r4   r#   ÚMLCDEncoderLayer.__init__û   s   ø€ Ü‰Ñ˜Ô Ü& vÓ.ˆr6   r‹   rŒ   r   Úoutput_attentionsrP   c                 óÊ   • UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a;  
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
        Represents the hidden states from the previous layer or the input embeddings.
    position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
        A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
        Represents absolute positional embeddings for the query and key in the attention mechanism.
    attention_mask (`torch.FloatTensor`):
        Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
©r‹   rŒ   r   r±   )Úlayer_norm1r¯   Úlayer_norm2Úmlp)r1   r‹   rŒ   r   r±   Úresidualr¨   Úoutputss           r4   rg   ÚMLCDEncoderLayer.forwardÿ   sŠ   € ð* !ˆà×(Ñ(¨Ó7ˆØ&*§n¡nØ'Ø 3Ø)Ø/ð	 '5ð '
Ñ#ˆð !Ñ0ˆà ˆØ×(Ñ(¨Ó7ˆØŸ™ Ó/ˆØ Ñ0ˆà Ð"ˆæØÑ&ˆGàˆr6   )r¯   rˆ   )r>   r?   r@   rA   r   r#   rW   rj   rª   r   Úboolrƒ   rg   rE   rF   rG   s   @r4   r¬   r¬   ú   s€   ø† ð/Ð/÷ /ð 26Ø,1ñ*à—|‘|ð*ð # 5§<¡<°·±Ð#=Ñ>ð*ð ! §¡Ñ.ð	*ð
 $ D™>ð*ð 
ˆu× Ñ Ñ	!÷*ó *r6   r¬   c                   óâ   ^ • \ rS rSrSrS\4U 4S jjr    SS\R                  S\	\R                  \R                  4   S\\R                     S\\   S	\\   S
\\   S\\	\4   4S jjrSrU =r$ )ÚMLCDEncoderi,  zŸ
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`MLCDEncoderLayer`].

Args:
    config: MLCDVisionConfig
rn   c                 ó$   >• [         TU ]  U5        g)z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.N)r"   r#   rr   s     €r4   r#   ÚMLCDEncoder.__init__5  s   ø€ ä‰Ñ˜Õ r6   Úinputs_embedsrŒ   r   r±   Úoutput_hidden_statesÚreturn_dictrP   c                 óÊ  • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       H0  u  p«U(       a  Xy4-   nU" U	UUUS9nUS   n	U(       d  M(  XŒS   4-   nM2     U(       a  Xy4-   nU(       d  [        S X—U4 5       5      $ [        U	UUS9$ )aº  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
        A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
        Represents absolute positional embeddings for the query and key in the attention mechanism.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr!   r³   r   r9   c              3   ó,   #   • U H  oc  M  Uv •  M     g 7frp   r!   )Ú.0Úvs     r4   Ú	<genexpr>Ú&MLCDEncoder.forward.<locals>.<genexpr>w  s   é € ÐeÑ$S˜qŸ™Ò$Sùs   ‚‹	)Úlast_hidden_stater‹   Ú
attentions)rn   rÀ   Úuse_return_dictr±   Ú	enumerateÚlayersrª   r	   )r1   r¿   rŒ   r   r±   rÀ   rÁ   Úencoder_statesÚall_attentionsr‹   ÚidxÚencoder_layerÚlayer_outputss                r4   rg   ÚMLCDEncoder.forward9  s  € ðD %9Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ1BÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐæ3™¸ˆÞ0™°dˆà%ˆÜ"+¨D¯K©KÖ"8ÑˆCÞ#Ø!/Ð2BÑ!BÙ)Ø+Ø$7Ø-Ø"3ñ	ˆMð *¨!Ñ,ˆMç Ð Ø!/ÀÑ3CÐ2EÑ!E’ñ #9ö  Ø+Ð.>Ñ>ˆNæÜÑe ]ÀNÑ$SÓeÓeÐeÜØ+Ø(Ø%ñ
ð 	
r6   r!   ©NNNN)r>   r?   r@   rA   rB   r   r#   rW   rƒ   rª   rj   r   rº   r   r	   rg   rE   rF   rG   s   @r4   r¼   r¼   ,  s±   ø† ñð!Ð/÷ !ð 26Ø,0Ø/3Ø&*ñC
à×(Ñ(ðC
ð # 5§<¡<°·±Ð#=Ñ>ðC
ð ! §¡Ñ.ð	C
ð
 $ D™>ðC
ð ' t™nðC
ð ˜d‘^ðC
ð 
ˆuoÐ%Ñ	&÷C
ó C
r6   r¼   c                   óš   ^ • \ rS rSrS\4U 4S jjr\    SS\\R                     S\\
   S\\
   S\\
   S\\\4   4
S	 jj5       rS
rU =r$ )ÚMLCDVisionTransformeri  rn   c                 ó  >• [         TU ]  U5        [        UR                  UR                  -  S-  5      U l        [        R                  " [        R                  " SUR                  UR                  -  S-  5      5      U l
        g )Nr   r9   )r"   r#   rL   r$   r'   Úvision_rotary_embeddingÚnnÚ	ParameterrW   ÚrandnÚclass_pos_embrr   s     €r4   r#   ÚMLCDVisionTransformer.__init__€  sh   ø€ Ü‰Ñ˜Ô Ü':¸6×;MÑ;MÐQW×QkÑQkÑ;kÐopÑ;pÓ'qˆÔ$ÜŸ\š\¬%¯+ª+°a¸×9KÑ9KÈv×OiÑOiÑ9iÐmnÑ9nÓ*oÓpˆÕr6   rt   r±   rÀ   rÁ   rP   c                 ó€  • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eUR
                  S   U R                   R                  -  nUR
                  S   U R                   R                  -  nU R                  XV5      nUR                  U R                  R                  5      n[        R                  " U R                  U/SS9n[        R                  " Xw4SS9nUR                  5       UR                  5       4n	U R                  U5      n
U R!                  U
5      n
U R#                  U
U	UUUS9nUS   nUS S 2SS S 24   nU R%                  U5      nU(       d	  XÍ4USS  -   $ ['        UUUR(                  UR*                  S9$ )	Nz You have to specify pixel_valueséþÿÿÿrS   r   rT   )r¿   rŒ   r±   rÀ   rÁ   r9   )rÈ   Úpooler_outputr‹   rÉ   )rn   rÀ   rÊ   r±   Ú
ValueErrorrv   r*   r×   ry   rÛ   rR   rW   r|   r¤   r¥   r   Úpre_layrnormÚencoderÚpost_layernormr
   r‹   rÉ   )r1   rt   r±   rÀ   rÁ   rN   rO   rf   ÚembrŒ   r‹   Úencoder_outputsrÈ   Úpooled_outputs                 r4   rg   ÚMLCDVisionTransformer.forward…  s¿  € ð %9Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ1BÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐàÑÜÐ?Ó@Ð@à)×/Ñ/°Ñ3°t·{±{×7MÑ7MÑMÐØ(×.Ñ.¨rÑ2°d·k±k×6LÑ6LÑLÐØ×5Ñ5Ð6HÓ\ˆØ'×*Ñ*¨4×+=Ñ+=×+DÑ+DÓEˆÜŸš D×$6Ñ$6¸Ð#GÈQÑOˆÜiŠi˜Ð8¸bÑAˆØ"Ÿw™w›y¨#¯'©'«)Ð4ÐàŸ™¨Ó5ˆØ×)Ñ)¨-Ó8ˆàŸ,™,Ø'Ø 3Ø/Ø!5Ø#ð 'ð 
ˆð ,¨AÑ.ÐØ)ª!¨Q²¨'Ñ2ˆØ×+Ñ+¨MÓ:ˆæØ%Ð5¸ÈÈÐ8KÑKÐKä)Ø/Ø'Ø)×7Ñ7Ø&×1Ñ1ñ	
ð 	
r6   )rÛ   r×   rÓ   )r>   r?   r@   rA   r   r#   r   r   rW   rƒ   rº   r   rª   r
   rg   rE   rF   rG   s   @r4   rÕ   rÕ     s†   ø† ðqÐ/÷ qð
 ð 59Ø,0Ø/3Ø&*ñ/
à˜u×0Ñ0Ñ1ð/
ð $ D™>ð/
ð ' t™nð	/
ð
 ˜d‘^ð/
ð 
ˆuÐ0Ð0Ñ	1ô/
ó ö/
r6   rÕ   c                   ó6   • \ rS rSr% \\S'   SrSrSrSr	S r
Srg)ÚMLCDPreTrainedModeli¸  rn   ÚmlcdTc                 óT  • U R                   R                  n[        U[        5      (       a™  U R                   R                  n[        R
                  R                  UR                  SUR                  S-  U-  S9  [        R
                  R                  UR                  R                  UR                   R                  U-  S9  g[        U[        5      (       Ga   U R                   R                  nUR                  S-  SUR                   R                  -  S-  -  U-  nUR                  S-  U-  n[        R
                  R                  UR                  R                  US9  [        R
                  R                  UR                  R                  US9  [        R
                  R                  UR                   R                  US9  [        R
                  R                  UR"                  R                  US9  g[        U[$        5      (       aÓ  U R                   R                  nUR                   R&                  S-  SUR                   R                  -  S-  -  U-  nSUR                   R&                  -  S-  U-  n[        R
                  R                  UR(                  R                  US9  [        R
                  R                  UR*                  R                  US9  g[        U[,        5      (       av  U R                   R                  nUR                   R&                  UR                   R.                  -  S-  S-  U-  n[        R
                  R                  UR0                  SUS9  g[        U[        R2                  5      (       aJ  UR4                  R6                  R9                  5         UR                  R6                  R;                  S5        g[        U[        R<                  5      (       a3  UR4                  b%  UR4                  R6                  R9                  5         ggg)zInitialize the weightsr<   g      à¿)ÚmeanÚstd)rí   r   r=   N)rn   r-   Ú
isinstancerl   rØ   ÚinitÚnormal_r{   Ú	embed_dimrw   rx   r,   r…   r&   r’   r–   r—   rŸ   rI   r$   Úfc1Úfc2rÕ   r'   rÛ   Ú	LayerNormÚbiasÚdataÚzero_Úfill_ÚLinear)r1   ÚmoduleÚfactorÚin_proj_stdÚout_proj_stdÚfc_stdÚpos_emb_stds          r4   Ú_init_weightsÚ!MLCDPreTrainedModel._init_weightsÀ  sð  € à—‘×/Ñ/ˆÜfÔ2×3Ñ3Ø—[‘[×3Ñ3ˆFÜG‰GO‰O˜F×2Ñ2¸À&×BRÑBRÐTXÑBXÐ[aÑBaˆOÑbÜG‰GO‰O˜F×2Ñ2×9Ñ9¸v¿}¹}×?^Ñ?^ÐagÑ?gˆOÒhÜ˜¤×.Ò.Ø—[‘[×3Ñ3ˆFØ!×+Ñ+¨TÑ1°q¸6¿=¹=×;ZÑ;ZÑ7ZÐ_cÑ6cÑdÐgmÑmˆKØ"×,Ñ,¨dÑ2°fÑ<ˆLÜG‰GO‰O˜FŸM™M×0Ñ0°kˆOÑBÜG‰GO‰O˜FŸM™M×0Ñ0°kˆOÑBÜG‰GO‰O˜FŸM™M×0Ñ0°kˆOÑBÜG‰GO‰O˜FŸO™O×2Ñ2¸ˆOÒEÜ˜¤×(Ñ(Ø—[‘[×3Ñ3ˆFØ!Ÿ=™=×4Ñ4°dÑ:ÀÀFÇMÁM×DcÑDcÑ@cÐhlÑ?lÑmÐpvÑvˆKØ˜&Ÿ-™-×3Ñ3Ñ3¸Ñ<¸vÑEˆFÜG‰GO‰O˜FŸJ™J×-Ñ-°6ˆOÑ:ÜG‰GO‰O˜FŸJ™J×-Ñ-°;ˆOÒ?Ü˜Ô 5×6Ñ6Ø—[‘[×3Ñ3ˆFØ!Ÿ=™=×4Ñ4¸¿¹×8YÑ8YÑYÐ]^Ñ^ÐcgÑgÐjpÑpˆKÜG‰GO‰O˜F×0Ñ0°sÀˆOÒLÜ˜¤§¡×-Ñ-ØK‰K×Ñ×"Ñ"Ô$ØM‰M×Ñ×$Ñ$ SÕ)Ü˜¤§	¡	×*Ñ*¨v¯{©{Ñ/FØK‰K×Ñ×"Ñ"Õ$ð 0GÐ*r6   r!   N)r>   r?   r@   rA   r   Ú__annotations__Úbase_model_prefixÚsupports_gradient_checkpointingÚ_supports_flash_attnÚ_supports_sdpar   rE   r!   r6   r4   ré   ré   ¸  s$   ‡ àÓØÐØ&*Ð#ØÐØ€Nõ%r6   ré   c                   ó~   • \ rS rSr\    S
S\\R                     S\\   S\\   S\\   S\	\
\4   4
S jj5       rS	rg)ÚMLCDVisionModelià  Nrt   r±   rÀ   rÁ   rP   c                 óÈ   • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUS9$ )a"  
Example:

```python
>>> import requests
>>> from PIL import Image
>>> from transformers import AutoProcessor, MLCDVisionModel
>>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
>>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs, output_attentions=True)

>>> features = outputs.last_hidden_state
>>> print(f"Extracted features shape: {features.shape}")
>>> print(f"Number of attention layers: {len(outputs.attentions)}")
>>> print(f"Attention shape: {outputs.attentions[0].shape}")
```)rt   r±   rÀ   rÁ   )rn   rÀ   rÊ   r±   Úvision_model)r1   rt   r±   rÀ   rÁ   s        r4   rg   ÚMLCDVisionModel.forwardá  su   € ð> %9Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ1BÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà× Ñ Ø%Ø/Ø!5Ø#ð	 !ð 
ð 	
r6   r!   rÓ   )r>   r?   r@   rA   r   r   rW   rƒ   rº   r   rª   r
   rg   rE   r!   r6   r4   r  r  à  st   † Øð 59Ø,0Ø/3Ø&*ñ(
à˜u×0Ñ0Ñ1ð(
ð $ D™>ð(
ð ' t™nð	(
ð
 ˜d‘^ð(
ð 
ˆuÐ0Ð0Ñ	1ô(
ó ó(
r6   r  )r   ré   r  )1Útypingr   r   r   rW   Útorch.nnrØ   Úconfiguration_utilsr   Úmodeling_flash_attention_utilsr   Úmodeling_outputsr	   r
   Úmodeling_utilsr   r   Úprocessing_utilsr   Úutilsr   r   Úclip.modeling_clipr   r   r   r   r   r   r   Úllama.modeling_llamar   Úqwen2_vl.modeling_qwen2_vlr   r   Ú
get_loggerr>   Úloggerr   rI   rL   rl   r…   r¬   r¼   rÕ   ré   r  Ú__all__r!   r6   r4   Ú<module>r     sí   ð÷ -Ñ ,ã Ý å 3Ý Bß Kß FÝ &ß ,÷÷ ñ õ ;ß [ð 
×	Ò	˜HÓ	%€ôY%Ð'ô Y%ôx	ˆgô 	ôÐ/ô ôDÐ/ô ô$9)Mô 9)ôx/Ð'ô /ôdP
+ô P
ôf6
Ð1ô 6
ðr ô$%˜/ó $%ó ð$%ôN*
oô *
òZr6   