
    <hc8                        S r SSKJr  SSKJr  SSKJrJrJr  \(       a  SSK	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  \R$                  " \5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r/ SQrg)zOWL-ViT model configuration    OrderedDict)Mapping)TYPE_CHECKINGAnyOptional   )ProcessorMixin)
TensorType)PretrainedConfig)
OnnxConfig)loggingc                   T   ^  \ rS rSrSrSrSr              SU 4S jjrSrU =r	$ )OwlViTTextConfig"   a  
This is the configuration class to store the configuration of an [`OwlViTTextModel`]. It is used to instantiate an
OwlViT text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OwlViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the OWL-ViT text model. Defines the number of different tokens that can be represented
        by the `inputs_ids` passed when calling [`OwlViTTextModel`].
    hidden_size (`int`, *optional*, defaults to 512):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    max_position_embeddings (`int`, *optional*, defaults to 16):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    pad_token_id (`int`, *optional*, defaults to 0):
        The id of the padding token in the input sequences.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the input sequences.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the input sequences.

Example:

```python
>>> from transformers import OwlViTTextConfig, OwlViTTextModel

>>> # Initializing a OwlViTTextModel with google/owlvit-base-patch32 style configuration
>>> configuration = OwlViTTextConfig()

>>> # Initializing a OwlViTTextConfig from the google/owlvit-base-patch32 style configuration
>>> model = OwlViTTextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```owlvit_text_modeltext_configc                    > [         TU ]  " SXUS.UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        g )N)pad_token_idbos_token_ideos_token_id )super__init__
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsmax_position_embeddings
hidden_actlayer_norm_epsattention_dropoutinitializer_rangeinitializer_factor)selfr   r   r   r   r   r    r!   r"   r#   r$   r%   r   r   r   kwargs	__class__s                   g/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/owlvit/configuration_owlvit.pyr   OwlViTTextConfig.__init__a   s^    $ 	sl\hslrs$&!2!2#6 '>$$,!2!2"4    )r#   r!   r   r%   r$   r   r"   r    r   r   r   )i      i            
quick_geluh㈵>        {Gz?      ?r   i  i  
__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyr   __static_attributes____classcell__r(   s   @r)   r   r   "   sK    9v %J#O  "5 5r+   r   c                   P   ^  \ rS rSrSrSrSr            SU 4S jjrSrU =r	$ )OwlViTVisionConfig   a
  
This is the configuration class to store the configuration of an [`OwlViTVisionModel`]. It is used to instantiate
an OWL-ViT image encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the OWL-ViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 768):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_factor (`float`, *optional*, defaults to 1.0):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).

Example:

```python
>>> from transformers import OwlViTVisionConfig, OwlViTVisionModel

>>> # Initializing a OwlViTVisionModel with google/owlvit-base-patch32 style configuration
>>> configuration = OwlViTVisionConfig()

>>> # Initializing a OwlViTVisionModel model from the google/owlvit-base-patch32 style configuration
>>> model = OwlViTVisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```owlvit_vision_modelvision_configc                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        X`l        Xpl        Xl	        Xl
        Xl        Xl        Xl        g )Nr   )r   r   r   r   r   r   num_channels
image_size
patch_sizer!   r"   r#   r$   r%   )r&   r   r   r   r   rF   rG   rH   r!   r"   r#   r$   r%   r'   r(   s                 r)   r   OwlViTVisionConfig.__init__   sZ      	"6"&!2!2#6 ($$$,!2!2"4r+   )r#   r!   r   rG   r%   r$   r   r"   r   rF   r   rH   )   i   r-   r-   r	   rJ       r0   r1   r2   r3   r4   r5   r?   s   @r)   rA   rA      sE    2h 'J%O 5 5r+   rA   c                   d   ^  \ rS rSrSrSr\\S.r     S
U 4S jjr	\
S\S\4S j5       rS	rU =r$ )OwlViTConfig   ai  
[`OwlViTConfig`] is the configuration class to store the configuration of an [`OwlViTModel`]. It is used to
instantiate an OWL-ViT model according to the specified arguments, defining the text model and vision model
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the OWL-ViT
[google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`OwlViTTextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`OwlViTVisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter. Default is used as per the original OWL-ViT
        implementation.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return a dictionary. If `False`, returns a tuple.
    kwargs (*optional*):
        Dictionary of keyword arguments.
owlvit)r   rD   c                    > [         TU ]  " S0 UD6  Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        [	        S0 UD6U l        [        S0 UD6U l        X0l        X@l	        XPl
        SU l        g )NzKtext_config is None. Initializing the OwlViTTextConfig with default values.zOvision_config is None. initializing the OwlViTVisionConfig with default values.r4   r   )r   r   loggerinfor   r   rA   rD   projection_dimlogit_scale_init_valuereturn_dictr%   )r&   r   rD   rS   rT   rU   r'   r(   s          r)   r   OwlViTConfig.__init__   s     	"6"KKKef MKKij+:k:/@-@,&<#&"%r+   r   rD   c                 <    0 nXS'   X$S'   U R                   " U40 UD6$ )z
Instantiate a [`OwlViTConfig`] (or a derived class) from owlvit text model configuration and owlvit vision
model configuration.

Returns:
    [`OwlViTConfig`]: An instance of a configuration object
r   rD   )	from_dict)clsr   rD   r'   config_dicts        r)   from_text_vision_configs%OwlViTConfig.from_text_vision_configs  s-     %0M"'4O$}}[3F33r+   )r%   rT   rS   rU   r   rD   )NNr,   g/L
F@T)r6   r7   r8   r9   r:   r;   r   rA   sub_configsr   classmethoddictr[   r=   r>   r?   s   @r)   rM   rM      sR    2 J"2EWXK %&6 44 4 4 4r+   rM   c                      ^  \ rS rSr\S\\\\\4   4   4S j5       r\S\\\\\4   4   4S j5       r	\S\
4S j5       r   SSSS\S	\S
\S   S\\\4   4
U 4S jjjr\S\4S j5       rSrU =r$ )OwlViTOnnxConfigi"  returnc           	      @    [        SSSS.4SSSSSS	.4S
SSS.4/5      $ )N	input_idsbatchsequence)r      pixel_valuesrF   heightwidth)r   rg      r	   attention_maskr   r&   s    r)   inputsOwlViTOnnxConfig.inputs#  s@    'j9:WHQX!YZ!w:#>?
 	
r+   c                 @    [        SSS04SSS04SSS04SSS04/5      $ )Nlogits_per_imager   re   logits_per_texttext_embedsimage_embedsr   rm   s    r)   outputsOwlViTOnnxConfig.outputs-  sD    #a\2"QL1G-!W.	
 	
r+   c                     g)Ng-C6?r   rm   s    r)   atol_for_validation$OwlViTOnnxConfig.atol_for_validation8  s    r+   	processorr
   
batch_size
seq_length	frameworkr   c                 r   > [         TU ]  UR                  X#US9n[         TU ]  UR                  X$S9n0 UEUE$ )N)r{   r|   r}   )r{   r}   )r   generate_dummy_inputs	tokenizerimage_processor)r&   rz   r{   r|   r}   text_input_dictimage_input_dictr(   s          r)   r   &OwlViTOnnxConfig.generate_dummy_inputs<  s\      '7JYb 8 
 !78%%* 9 
 7/6%566r+   c                     g)N   r   rm   s    r)   default_onnx_opset#OwlViTOnnxConfig.default_onnx_opsetK  s    r+   r   )r   N)r6   r7   r8   r9   propertyr   strintrn   ru   floatrx   r   r   r   r   r=   r>   r?   s   @r)   ra   ra   "  s    
WS#X%6 67 
 
 
gc3h&7!78 
 
 U   ,07#7 7 	7
 L)7 
c	7 7 C  r+   ra   )rM   ra   r   rA   N)r:   collectionsr   collections.abcr   typingr   r   r   processing_utilsr
   utilsr   configuration_utilsr   onnxr   r   
get_loggerr6   rQ   r   rA   rM   ra   __all__r   r+   r)   <module>r      s    " # # / / 2# 3   
		H	%]5' ]5@U5) U5pE4# E4P+z +\ Yr+   