ó
    <±hc8  ã                   ó  • S r SSKJr  SSKJr  SSKJrJrJr  \(       a  SSK	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  \R$                  " \5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r/ SQrg)zOWL-ViT model configurationé    ©ÚOrderedDict)ÚMapping)ÚTYPE_CHECKINGÚAnyÚOptionalé   )ÚProcessorMixin)Ú
TensorType)ÚPretrainedConfig)Ú
OnnxConfig)Úloggingc                   óT   ^ • \ rS rSrSrSrSr              SU 4S jjrSrU =r	$ )ÚOwlViTTextConfigé"   aÃ  
This is the configuration class to store the configuration of an [`OwlViTTextModel`]. It is used to instantiate an
OwlViT text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OwlViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the OWL-ViT text model. Defines the number of different tokens that can be represented
        by the `inputs_ids` passed when calling [`OwlViTTextModel`].
    hidden_size (`int`, *optional*, defaults to 512):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    max_position_embeddings (`int`, *optional*, defaults to 16):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    pad_token_id (`int`, *optional*, defaults to 0):
        The id of the padding token in the input sequences.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the input sequences.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the input sequences.

Example:

```python
>>> from transformers import OwlViTTextConfig, OwlViTTextModel

>>> # Initializing a OwlViTTextModel with google/owlvit-base-patch32 style configuration
>>> configuration = OwlViTTextConfig()

>>> # Initializing a OwlViTTextConfig from the google/owlvit-base-patch32 style configuration
>>> model = OwlViTTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```Úowlvit_text_modelÚtext_configc                 ó°   >• [         TU ]  " SXÍUS.UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        X€l	        Xl
        X l        X°l        g )N)Úpad_token_idÚbos_token_idÚeos_token_id© )ÚsuperÚ__init__Ú
vocab_sizeÚhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚnum_attention_headsÚmax_position_embeddingsÚ
hidden_actÚlayer_norm_epsÚattention_dropoutÚinitializer_rangeÚinitializer_factor)Úselfr   r   r   r   r   r    r!   r"   r#   r$   r%   r   r   r   ÚkwargsÚ	__class__s                   €Úg/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/owlvit/configuration_owlvit.pyr   ÚOwlViTTextConfig.__init__a   s^   ø€ ô$ 	‰ÒÐs lÐ\hÑsÐlrÒsà$ŒØ&ÔØ!2ÔØ!2ÔØ#6Ô Ø'>Ô$Ø$ŒØ,ÔØ!2ÔØ!2ÔØ"4Õó    )r#   r!   r   r%   r$   r   r"   r    r   r   r   )i Á  é   i   é   é   é   Ú
quick_geluçñhãˆµøä>ç        ç{®Gáz”?ç      ð?r   iþÀ  iÿÀ  ©
Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__Ú
model_typeÚbase_config_keyr   Ú__static_attributes__Ú__classcell__©r(   s   @r)   r   r   "   sK   ø† ñ9ðv %€JØ#€Oð ØØØØØ "ØØØØØØØØ÷5õ 5r+   r   c                   óP   ^ • \ rS rSrSrSrSr            SU 4S jjrSrU =r	$ )ÚOwlViTVisionConfigé‚   a¼
  
This is the configuration class to store the configuration of an [`OwlViTVisionModel`]. It is used to instantiate
an OWL-ViT image encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OWL-ViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 768):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import OwlViTVisionConfig, OwlViTVisionModel

>>> # Initializing a OwlViTVisionModel with google/owlvit-base-patch32 style configuration
>>> configuration = OwlViTVisionConfig()

>>> # Initializing a OwlViTVisionModel model from the google/owlvit-base-patch32 style configuration
>>> model = OwlViTVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```Úowlvit_vision_modelÚvision_configc                 ó¶   >• [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        X€l	        Xl
        X l        X°l        XÀl        g )Nr   )r   r   r   r   r   r   Únum_channelsÚ
image_sizeÚ
patch_sizer!   r"   r#   r$   r%   )r&   r   r   r   r   rF   rG   rH   r!   r"   r#   r$   r%   r'   r(   s                 €r)   r   ÚOwlViTVisionConfig.__init__º   sZ   ø€ ô  	‰ÒÑ"˜6Ò"à&ÔØ!2ÔØ!2ÔØ#6Ô Ø(ÔØ$ŒØ$ŒØ$ŒØ,ÔØ!2ÔØ!2ÔØ"4Õr+   )r#   r!   r   rG   r%   r$   r   r"   r   rF   r   rH   )é   i   r-   r-   r	   rJ   é    r0   r1   r2   r3   r4   r5   r?   s   @r)   rA   rA   ‚   sE   ø† ñ2ðh '€JØ%€Oð ØØØØØØØØØØØ÷5õ 5r+   rA   c                   ód   ^ • \ rS rSrSrSr\\S.r     S
U 4S jjr	\
S\S\4S j5       rS	rU =r$ )ÚOwlViTConfigéÚ   ai  
[`OwlViTConfig`] is the configuration class to store the configuration of an [`OwlViTModel`]. It is used to
instantiate an OWL-ViT model according to the specified arguments, defining the text model and vision model
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the OWL-ViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`OwlViTTextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`OwlViTVisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter. Default is used as per the original OWL-ViT
        implementation.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return a dictionary. If `False`, returns a tuple.
    kwargs (*optional*):
        Dictionary of keyword arguments.
Úowlvit)r   rD   c                 ó   >• [         TU ]  " S0 UD6  Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        [	        S0 UD6U l        [        S0 UD6U l        X0l        X@l	        XPl
        SU l        g )NzKtext_config is None. Initializing the OwlViTTextConfig with default values.zOvision_config is None. initializing the OwlViTVisionConfig with default values.r4   r   )r   r   ÚloggerÚinfor   r   rA   rD   Úprojection_dimÚlogit_scale_init_valueÚreturn_dictr%   )r&   r   rD   rS   rT   rU   r'   r(   s          €r)   r   ÚOwlViTConfig.__init__÷   s   ø€ ô 	‰ÒÑ"˜6Ò"àÑØˆKÜK‰KÐeÔfàÑ ØˆMÜK‰KÐiÔjä+Ñ:¨kÑ:ˆÔÜ/Ñ@°-Ñ@ˆÔà,ÔØ&<Ô#Ø&ÔØ"%ˆÕr+   r   rD   c                 ó<   • 0 nXS'   X$S'   U R                   " U40 UD6$ )zÇ
Instantiate a [`OwlViTConfig`] (or a derived class) from owlvit text model configuration and owlvit vision
model configuration.

Returns:
    [`OwlViTConfig`]: An instance of a configuration object
r   rD   )Ú	from_dict)Úclsr   rD   r'   Úconfig_dicts        r)   Úfrom_text_vision_configsÚ%OwlViTConfig.from_text_vision_configs  s-   € ð ˆØ%0MÑ"Ø'4OÑ$à}Š}˜[Ñ3¨FÑ3Ð3r+   )r%   rT   rS   rU   r   rD   )NNr,   gƒ/L¦
F@T)r6   r7   r8   r9   r:   r;   r   rA   Úsub_configsr   ÚclassmethodÚdictr[   r=   r>   r?   s   @r)   rM   rM   Ú   sR   ø† ñð2 €JØ"2ÐEWÑX€Kð ØØØ%Ø÷&ð6 ð4°4ð 4Èó 4ó ö4r+   rM   c                   óâ   ^ • \ rS rSr\S\\\\\4   4   4S j5       r\S\\\\\4   4   4S j5       r	\S\
4S j5       r   SSSS\S	\S
\S   S\\\4   4
U 4S jjjr\S\4S j5       rSrU =r$ )ÚOwlViTOnnxConfigi"  Úreturnc           	      ó@   • [        SSSS.4SSSSSS	.4S
SSS.4/5      $ )NÚ	input_idsÚbatchÚsequence)r   é   Úpixel_valuesrF   ÚheightÚwidth)r   rg   é   r	   Úattention_maskr   ©r&   s    r)   ÚinputsÚOwlViTOnnxConfig.inputs#  s@   € äà '¨jÑ9Ð:Ø W°ÀHÐQXÑ!YÐZØ! w°:Ñ#>Ð?ðó
ð 	
r+   c                 ó@   • [        SSS04SSS04SSS04SSS04/5      $ )NÚlogits_per_imager   re   Úlogits_per_textÚtext_embedsÚimage_embedsr   rm   s    r)   ÚoutputsÚOwlViTOnnxConfig.outputs-  sD   € äà# a¨ \Ð2Ø" Q¨ LÐ1Ø  G Ð-Ø ! W Ð.ð	ó
ð 	
r+   c                 ó   • g)Ng-Cëâ6?r   rm   s    r)   Úatol_for_validationÚ$OwlViTOnnxConfig.atol_for_validation8  s   € àr+   Ú	processorr
   Ú
batch_sizeÚ
seq_lengthÚ	frameworkr   c                 ór   >• [         TU ]  UR                  X#US9n[         TU ]  UR                  X$S9n0 UEUE$ )N)r{   r|   r}   )r{   r}   )r   Úgenerate_dummy_inputsÚ	tokenizerÚimage_processor)r&   rz   r{   r|   r}   Útext_input_dictÚimage_input_dictr(   s          €r)   r   Ú&OwlViTOnnxConfig.generate_dummy_inputs<  s\   ø€ ô  ™'Ñ7Ø×Ñ¨JÐYbð 8ð 
ˆô !™7Ñ8Ø×%Ñ%°*ð 9ð 
Ðð 7/Ð6Ð%5Ð6Ð6r+   c                 ó   • g)Né   r   rm   s    r)   Údefault_onnx_opsetÚ#OwlViTOnnxConfig.default_onnx_opsetK  s   € àr+   r   )éÿÿÿÿr‰   N)r6   r7   r8   r9   Úpropertyr   ÚstrÚintrn   ru   Úfloatrx   r   r   r   r‡   r=   r>   r?   s   @r)   ra   ra   "  sæ   ø† Øð
˜  W¨S°#¨XÑ%6Ð 6Ñ7ó 
ó ð
ð ð
˜  g¨c°3¨hÑ&7Ð!7Ñ8ó 
ó ð
ð ð Uó ó ðð ØØ,0ñ7à#ð7ð ð7ð ð	7ð
 ˜LÑ)ð7ð 
cÑ	÷7ð 7ð ð Có ó ör+   ra   )rM   ra   r   rA   N)r:   Úcollectionsr   Úcollections.abcr   Útypingr   r   r   Úprocessing_utilsr
   Úutilsr   Úconfiguration_utilsr   Úonnxr   r   Ú
get_loggerr6   rQ   r   rA   rM   ra   Ú__all__r   r+   r)   Ú<module>r—      s   ðñ "å #Ý #ß /Ñ /ö Ý2Ý#å 3Ý Ý ð 
×	Ò	˜HÓ	%€ô]5Ð'ô ]5ô@U5Ð)ô U5ôpE4Ð#ô E4ôP+zô +ò\ Yr+   