
    <hz)                        S r SSKJr  SSKJr  SSKrSSKJr  SSKJr  SSK	J
r
Jr  SS	KJr  S
SKJr  \\" SS9 " S S\
5      5       5       r\ " S S\5      5       r " S S\R$                  5      r " S S\R$                  5      r " S S\R$                  5      r " S S\R$                  5      r " S S\R$                  5      r\" SS9 " S S\5      5       rSS/rg)zPyTorch ViTMatte model.    )	dataclass)OptionalN)nn   )PreTrainedModel)ModelOutputauto_docstring)load_backbone   )VitMatteConfigz4
    Class for outputs of image matting models.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	ImageMattingOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Loss.
alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Estimated alpha values.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
    (also called feature maps) of the model at the output of each stage.
Nlossalphashidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   tupler   __static_attributes__r       f/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr   r      sg    	 )-D(5$$
%,*.FHU&&'.8<M8E%"3"345<59Ju00129r    r   c                   N    \ rS rSr% \\S'   SrSr/ rS\	R                  4S jrSrg)	VitMattePreTrainedModel5   configpixel_valuesTmodulec                 8   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         g g g )Ng        )meanstd)
isinstancer   Conv2dBatchNorm2dweightdatanormal_r%   initializer_rangebiaszero_)selfr'   s     r!   _init_weights%VitMattePreTrainedModel._init_weights<   sm    fryy"..9::MM&&CT[[5R5R&S{{&  &&( ' ;r    r   N)r   r   r   r   r   r   main_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler5   r   r   r    r!   r#   r#   5   s)    $O&*#)BII )r    r#   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )VitMatteBasicConv3x3C   zH
Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
c           	         > [         TU ]  5         [        R                  " UUSUUSS9U l        [        R
                  " X1R                  S9U l        [        R                  " 5       U l	        g )Nr   F)in_channelsout_channelskernel_sizestridepaddingr2   )eps)
super__init__r   r,   convr-   batch_norm_eps
batch_normReLUrelu)r4   r%   r?   r@   rB   rC   	__class__s         r!   rF   VitMatteBasicConv3x3.__init__H   sU    II#%
	 ..;P;PQGGI	r    c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ N)rG   rI   rK   r4   hidden_states     r!   forwardVitMatteBasicConv3x3.forwardU   s2    yy.|4yy.r    )rI   rG   rK   )   r   	r   r   r   r   r   rF   rR   r   __classcell__rL   s   @r!   r<   r<   C   s     r    r<   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteConvStream]   z[
Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
c                   > [         TU ]  5         SnUR                  b  UR                  R                  nUR                  n[
        R                  " 5       U l        U/U-   U l        [        [        U R                  5      S-
  5       HI  nU R                  U   nU R                  US-      nU R                  R                  [        XU5      5        MK     g )N   r   )rE   rF   backbone_confignum_channelsconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendr<   )r4   r%   r?   r@   iin_chan_	out_chan_rL   s          r!   rF   VitMatteConvStream.__init__b   s     !!- 00==K55]]_
&-,6s4??+a/0Aq)HA.IJJ26YOP 1r    c                     SU0nUn[        [        U R                  5      5       H-  nU R                  U   " U5      nS[        US-   5      -   nX2U'   M/     U$ )Ndetailed_feature_map_0detailed_feature_map_r   )rc   rd   ra   str)r4   r&   out_dict
embeddingsrf   name_s         r!   rR   VitMatteConvStream.forwardu   sZ    ,l;!
s4::'AAz2J+c!a%j8E(UO (
 r    )rb   ra   rU   rW   s   @r!   rY   rY   ]   s    Q& r    rY   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteFusionBlock   zT
Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
c                 D   > [         TU ]  5         [        XUSSS9U l        g )Nr   )rB   rC   )rE   rF   r<   rG   )r4   r%   r?   r@   rL   s       r!   rF   VitMatteFusionBlock.__init__   s"    (lST^_`	r    c                     [         R                  R                  USSSS9n[        R                  " X#/SS9nU R                  U5      nU$ )NrT   bilinearF)scale_factormodealign_cornersr   )dim)r   
functionalinterpolater   catrG   )r4   featuresdetailed_feature_mapupscaled_featuresouts        r!   rR   VitMatteFusionBlock.forward   sH    MM55hQU_ot5uii-AqIiin
r    )rG   rU   rW   s   @r!   rs   rs      s    a r    rs   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteHead   zB
Simple Matting Head, containing only conv3x3 and conv1x1 layers.
c                 &  > [         TU ]  5         UR                  S   nSn[        R                  " [        R
                  " X#SSSS9[        R                  " U5      [        R                  " S5      [        R
                  " USSSSS95      U l        g )N   r   r   )rA   rB   rC   Tr   )	rE   rF   fusion_hidden_sizesr   
Sequentialr,   r-   rJ   matting_convs)r4   r%   r?   mid_channelsrL   s       r!   rF   VitMatteHead.__init__   sr    004]]IIkQqRSTNN<(GGDMIIlA1QJ	
r    c                 (    U R                  U5      nU$ rO   r   rP   s     r!   rR   VitMatteHead.forward   s    )),7r    r   rU   rW   s   @r!   r   r      s    
 r    r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )VitMatteDetailCaptureModule   z?
Simple and lightweight Detail Capture Module for ViT Matting.
c           
        > [         TU ]  5         [        UR                  5      [        UR                  5      S-   :w  a  [        S5      eXl        [        U5      U l        U R                  R                  U l	        [        R                  " 5       U l        UR                  /UR                  -   U l        [        [        U R                  5      S-
  5       HX  nU R                  R!                  [#        UU R                  U   U R                  US-   *    -   U R                  US-      S95        MZ     [%        U5      U l        g )Nr   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r%   r?   r@   )rE   rF   rd   r   r_   
ValueErrorr%   rY   
convstreamrb   r   r`   fusion_blockshidden_sizefusion_channelsrc   re   rs   r   matting_head)r4   r%   rf   rL   s      r!   rF   $VitMatteDetailCaptureModule.__init__   s   v))*c&2P2P.QTU.UUq  ,V4//44]]_ & 2 23f6P6PPs4//0145A%%#! $ 4 4Q 7$//APQE(:S S!%!5!5a!e!< 6 )0r    c                 :   U R                  U5      n[        [        U R                  5      5       HB  nS[	        [        U R                  5      U-
  S-
  5      -   nU R                  U   " XU   5      nMD     [
        R                  " U R                  U5      5      nU$ )Nrl   r   )r   rc   rd   r   rm   r   sigmoidr   )r4   r   r&   detail_featuresrf   detailed_feature_map_namer   s          r!   rR   #VitMatteDetailCaptureModule.forward   s    //,7s4--./A(?#c$J\J\F]`aFadeFeBf(f%))!,XG`7abH 0 t00:;r    )r%   rb   r   r   r   r   rU   rW   s   @r!   r   r      s    12 r    r   zX
    ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    c                      ^  \ rS rSrU 4S jr\     S
S\\R                     S\\	   S\\	   S\\R                     S\\	   4
S jj5       r
S	rU =r$ )VitMatteForImageMatting   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g rO   )rE   rF   r%   r
   backboner   decoder	post_init)r4   r%   rL   s     r!   rF    VitMatteForImageMatting.__init__   s9     %f-26: 	r    r&   output_attentionsoutput_hidden_stateslabelsreturn_dictc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nSnUb  [	        S5      eU R
                  R                  XUS9nUR                  S   nU R                  X5      n	U(       d  U	4USS -   n
Ub  U4U
-   $ U
$ [        UU	UR                  UR                  S9$ )ap  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth image matting for computing the loss.

Examples:

```python
>>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
>>> import torch
>>> from PIL import Image
>>> from huggingface_hub import hf_hub_download

>>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
>>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

>>> filepath = hf_hub_download(
...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
... )
>>> image = Image.open(filepath).convert("RGB")
>>> filepath = hf_hub_download(
...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
... )
>>> trimap = Image.open(filepath).convert("L")

>>> # prepare image + trimap for the model
>>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

>>> with torch.no_grad():
...     alphas = model(**inputs).alphas
>>> print(alphas.shape)
torch.Size([1, 1, 640, 960])
```NzTraining is not yet supported)r   r   r   r   )r   r   r   r   )r%   use_return_dictr   r   NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsr   r   r   r   )r4   r&   r   r   r   r   r   outputsr   r   outputs              r!   rR   VitMatteForImageMatting.forward   s    R &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq%&EFF--<<Wh = 
 ''+h5Y,F)-)9TGf$EvE!!//))	
 	
r    )r   r%   r   )NNNNN)r   r   r   r   rF   r	   r   r   TensorboolrR   r   rV   rW   s   @r!   r   r      s      04,0/3)-&*B
u||,B
 $D>B
 'tn	B

 &B
 d^B
 B
r    r   )r   dataclassesr   typingr   r   r   modeling_utilsr   utilsr   r	   utils.backbone_utilsr
   configuration_vitmatter   r   r#   r:   r<   rY   rs   r   r   r   __all__r   r    r!   <module>r      s     !    - 0 1 2 
: : :$ 
)o 
) 
)299 4   F")) "299 0&")) &R 
N
5 N

N
b %&?
@r    