
"""PyTorch CLIPSeg model."""

import copy
import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_clipseg import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig


logger = logging.get_logger(__name__)


# Contrastive loss: cross-entropy over the similarity matrix, with the diagonal as targets.
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
@auto_docstring
class CLIPSegOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
@auto_docstring
class CLIPSegDecoderOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    """

    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring
class CLIPSegImageSegmentationOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    conditional_embeddings: Optional[torch.FloatTensor] = None
    pooled_output: Optional[torch.FloatTensor] = None
    vision_model_output: BaseModelOutputWithPooling = None
    decoder_output: CLIPSegDecoderOutput = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["vision_model_output", "decoder_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class CLIPSegVisionEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model supports dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=True) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [batch_size, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class CLIPSegTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class CLIPSegAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[CLIPSegTextConfig, CLIPSegVisionConfig]):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # The text model uses both `causal_attention_mask` and `attention_mask`: fold them into a single
        # additive mask unless a flash-attention kernel (which takes `is_causal` instead) is used.
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class CLIPSegMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPSegEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class CLIPSegPreTrainedModel(PreTrainedModel):
    config: CLIPSegConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, CLIPSegTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPSegVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, CLIPSegAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPSegMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPSegModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class CLIPSegEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class CLIPSegTextTransformer(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = CLIPSegTextEmbeddings(config)
        self.encoder = CLIPSegEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # needed to compute `pooled_output` below
        self.eos_token_id = config.eos_token_id

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # the text model uses a causal mask
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask from [batch_size, seq_len] to [batch_size, 1, tgt_seq_len, src_seq_len]
        if attention_mask is not None:
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # legacy behaviour for configs with the old default `eos_token_id`:
            # pool the features at the position of the highest token id (the end-of-text token)
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # pool the features at the position of the first `eos_token_id` (allows extra new tokens in the vocab)
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class CLIPSegTextModel(CLIPSegPreTrainedModel):
    config: CLIPSegTextConfig

    _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer"]

    def __init__(self, config: CLIPSegTextConfig):
        super().__init__(config)
        self.text_model = CLIPSegTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class CLIPSegVisionTransformer(nn.Module):
    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPSegVisionEmbeddings(config)
        # NOTE: the attribute name (missing "e") is kept as-is to stay compatible with pretrained checkpoints
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPSegEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor],
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = True,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class CLIPSegVisionModel(CLIPSegPreTrainedModel):
    config: CLIPSegVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPSegVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


@auto_docstring
class CLIPSegModel(CLIPSegPreTrainedModel):
    config: CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        if not isinstance(config.text_config, CLIPSegTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type CLIPSegTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPSegVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # propagate the selected attention implementation to the sub-configs
        text_config._attn_implementation = config._attn_implementation
        vision_config._attn_implementation = config._attn_implementation

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = CLIPSegTextTransformer(text_config)
        self.vision_model = CLIPSegVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use the CLIPSeg model's config for some fields (if specified) instead of those of the text component.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use the CLIPSeg model's config for some fields (if specified) instead of those of the vision component.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, CLIPSegOutput]:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use the CLIPSeg model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clipseg_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return CLIPSegOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


class CLIPSegDecoderLayer(nn.Module):
    """
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )

        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm1(hidden_states)

        residual = hidden_states
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm2(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class CLIPSegDecoder(CLIPSegPreTrainedModel):
    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.conditional_layer = config.conditional_layer

        # FiLM conditioning of the decoder activations on the conditional (text or image) embedding
        self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim)
        self.film_add = nn.Linear(config.projection_dim, config.reduce_dim)

        if config.use_complex_transposed_convolution:
            transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4)

            self.transposed_convolution = nn.Sequential(
                nn.Conv2d(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim,
                    config.reduce_dim // 2,
                    kernel_size=transposed_kernels[0],
                    stride=transposed_kernels[0],
                ),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1]
                ),
            )
        else:
            self.transposed_convolution = nn.ConvTranspose2d(
                config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size
            )

        depth = len(config.extract_layers)
        self.reduces = nn.ModuleList(
            [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)]
        )

        decoder_config = copy.deepcopy(config.vision_config)
        decoder_config.hidden_size = config.reduce_dim
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        decoder_config.hidden_act = "relu"
        self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))])

    def forward(
        self,
        hidden_states: tuple[torch.Tensor],
        conditional_embeddings: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        activations = hidden_states[::-1]

        output = None
        for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)):
            if output is not None:
                output = reduce(activation) + output
            else:
                output = reduce(activation)

            if i == self.conditional_layer:
                output = self.film_mul(conditional_embeddings) * output.permute(1, 0, 2) + self.film_add(
                    conditional_embeddings
                )
                output = output.permute(1, 0, 2)

            layer_outputs = layer(
                output, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions
            )

            output = layer_outputs[0]

            if output_hidden_states:
                all_hidden_states += (output,)

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        # drop the CLS token and reshape to [batch_size, reduce_dim, seq_len]
        output = output[:, 1:, :].permute(0, 2, 1)

        size = int(math.sqrt(output.shape[2]))

        batch_size = conditional_embeddings.shape[0]
        output = output.view(batch_size, output.shape[1], size, size)

        logits = self.transposed_convolution(output).squeeze(1)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)

        return CLIPSegDecoderOutput(
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )


@auto_docstring(
    custom_intro="""
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    """
)
class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
    config: CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.config = config

        self.clip = CLIPSegModel(config)
        self.extract_layers = config.extract_layers

        self.decoder = CLIPSegDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_conditional_embeddings(
        self,
        batch_size: Optional[int] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        conditional_pixel_values: Optional[torch.Tensor] = None,
    ):
        if input_ids is not None:
            # compute conditional embeddings from texts
            if len(input_ids) != batch_size:
                raise ValueError("Make sure to pass as many prompt texts as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_text_features(
                    input_ids, attention_mask=attention_mask, position_ids=position_ids
                )
        elif conditional_pixel_values is not None:
            # compute conditional embeddings from images
            if len(conditional_pixel_values) != batch_size:
                raise ValueError("Make sure to pass as many prompt images as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_image_features(conditional_pixel_values)
        else:
            raise ValueError(
                "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`"
            )

        return conditional_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        conditional_pixel_values: Optional[torch.FloatTensor] = None,
        conditional_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, CLIPSegImageSegmentationOutput]:
        r"""
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the query images through the frozen CLIP vision encoder
        with torch.no_grad():
            vision_outputs = self.clip.vision_model(
                pixel_values=pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=True,  # the decoder needs the intermediate hidden states
                interpolate_pos_encoding=interpolate_pos_encoding,
                return_dict=return_dict,
            )
            pooled_output = self.clip.visual_projection(vision_outputs[1])

            hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2]
            # add +1 because the hidden states also include the initial embeddings
            activations = [hidden_states[i + 1] for i in self.extract_layers]

            # rebuild vision_outputs so the hidden states are only returned when the caller asked for them
            if return_dict:
                vision_outputs = BaseModelOutputWithPooling(
                    last_hidden_state=vision_outputs.last_hidden_state,
                    pooler_output=vision_outputs.pooler_output,
                    hidden_states=vision_outputs.hidden_states if output_hidden_states else None,
                    attentions=vision_outputs.attentions,
                )
            else:
                vision_outputs = (
                    vision_outputs[:2] + vision_outputs[3:] if not output_hidden_states else vision_outputs
                )

        # step 2: compute conditional embeddings, either from text, images or a user-provided embedding
        if conditional_embeddings is None:
            conditional_embeddings = self.get_conditional_embeddings(
                batch_size=pixel_values.shape[0],
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                conditional_pixel_values=conditional_pixel_values,
            )
        else:
            if conditional_embeddings.shape[0] != pixel_values.shape[0]:
                raise ValueError(
                    "Make sure to pass as many conditional embeddings as there are query images in the batch"
                )
            if conditional_embeddings.shape[1] != self.config.projection_dim:
                raise ValueError(
                    "Make sure that the feature dimension of the conditional embeddings matches"
                    " `config.projection_dim`."
                )

        # step 3: forward the activations through the lightweight decoder to predict masks
        decoder_outputs = self.decoder(
            activations,
            conditional_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = decoder_outputs.logits if return_dict else decoder_outputs[0]

        loss = None
        if labels is not None:
            # move labels to the same device as the logits to enable model parallelism
            labels = labels.to(logits.device)
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(logits, labels)

        if not return_dict:
            output = (logits, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs)
            return ((loss,) + output) if loss is not None else output

        return CLIPSegImageSegmentationOutput(
            loss=loss,
            logits=logits,
            conditional_embeddings=conditional_embeddings,
            pooled_output=pooled_output,
            vision_model_output=vision_outputs,
            decoder_output=decoder_outputs,
        )


__all__ = [
    "CLIPSegModel",
    "CLIPSegPreTrainedModel",
    "CLIPSegTextModel",
    "CLIPSegVisionModel",
    "CLIPSegForImageSegmentation",
]