
    3j׮                     8   S r SSKrSSKJr  SSKJr  SSKJr  SSKrSSKJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSKJr  SSKJr  SSKJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,  \#RZ                  " \.5      r/\"" SS9\ " S S\ 5      5       5       r0\"" SS9\ " S S\ 5      5       5       r1\"\ " S S\ 5      5       5       r2S\Rf                  S\Rf                  4S  jr4S!\Rf                  S\Rf                  4S" jr5S#\,S$\64S% jr7S^S&\6\8-  S'\94S( jjr: " S) S*\	Rv                  5      r< " S+ S,\	Rz                  5      r> " S- S.\	Rv                  5      r? " S/ S0\	Rv                  5      r@ " S1 S2\	Rv                  5      rA " S3 S4\	Rv                  5      rB " S5 S6\	Rv                  5      rC " S7 S8\	Rv                  5      rD " S9 S:\	Rv                  5      rE S_S;\	Rv                  S<\Rf                  S=\Rf                  S>\Rf                  S?\Rf                  S-  S@\FSA\F4SB jjrG " SC SD\	Rv                  5      rH " SE SF\	Rv                  5      rI " SG SH\	Rv                  5      rJ " SI SJ\	Rv                  5      rK " SK SL\	Rv                  5      rL " SM SN\5      rM " SO SP\	Rv                  5      rN " SQ SR\	Rv                  5      rO\" " SS ST\5      5       rP\"" SUS9 " SV SW\P5      5       rQ\"" SXS9 " SY SZ\P5      5       rR\" " S[ S\\P5      5       rS/ S]QrTg)`zPyTorch ALIGN model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Srg)AlignVisionModelOutput.   z
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
Nimage_embedslast_hidden_statehidden_states )__name__
__module____qualname____firstlineno____doc__r#   torchFloatTensor__annotations__r$   r%   tuple__static_attributes__r&       b/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/align/modeling_align.pyr!   r!   .   sN    
 .2L%##d*126u((4/659M5**+d29r1   r!   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	AlignTextModelOutput?   z
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
Ntext_embedsr$   r%   
attentionsr&   )r'   r(   r)   r*   r+   r6   r,   r-   r.   r$   r%   r/   r7   r0   r&   r1   r2   r4   r4   ?   sh    
 -1K""T)026u((4/659M5**+d2926Je''(4/6r1   r4   c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\S	'   Sr\\S
'   S\\   4S jrSrg)AlignOutputQ   a.  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The output of [`AlignVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AlignTextModel`].
vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
    The output of the [`AlignVisionModel`].
Nlosslogits_per_imagelogits_per_textr6   r#   text_model_outputvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r>   r?   N)getattrto_tuple).0kselfs     r2   	<genexpr>'AlignOutput.to_tuple.<locals>.<genexpr>p   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)r/   keysrG   s   `r2   rD   AlignOutput.to_tupleo   s#     
YY[
 
 	
r1   r&   )r'   r(   r)   r*   r+   r;   r,   r-   r.   r<   r=   r6   r#   r>   r   r?   r   r/   r   rD   r0   r&   r1   r2   r9   r9   Q   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*14818DHAH
%* 
r1   r9   logitsr@   c                     [         R                  R                  U [        R                  " [        U 5      U R                  S9SS9$ )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr,   arangelenrP   )rM   s    r2   contrastive_lossrV   x   s5    ==&&vu||CKPVP]P]/^ps&ttr1   
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)rV   t)rW   caption_loss
image_losss      r2   
align_lossr\   |   s*    #J/L!*,,.1J%,,r1   confignum_channelsc                     U R                   nXR                  -  n[        U[        XS-  -   5      U-  U-  5      nUSU-  :  a  X2-  n[        U5      $ )z4
Round number of filters based on depth multiplier.
   g?)depth_divisorwidth_coefficientmaxint)r]   r^   divisornew_dims       r2   round_filtersrg      s`     ""G,,,L'3|k9:gEOPG |##w<r1   kernel_sizeadjustc                     [        U [        5      (       a  X 4n U S   S-  U S   S-  4nU(       a  US   S-
  US   US   S-
  US   4$ US   US   US   US   4$ )a.  
Utility function to get the tuple padding value for the depthwise convolution.

Args:
    kernel_size (`int` or `tuple`):
        Kernel size of the convolution layers.
    adjust (`bool`, *optional*, defaults to `True`):
        Adjusts padding value to apply to right and bottom sides of the input.
r   r`   r   )
isinstancerd   )rh   ri   corrects      r2   correct_padrm      s~     +s##"01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r1   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	AlignVisionEmbeddings   zD
A module that corresponds to the stem module of the original work.
r]   c           	      |  > [         TU ]  5         [        US5      U l        [        R
                  " SS9U l        [        R                  " UR                  U R                  SSSSS9U l	        [        R                  " U R                  UR                  UR                  S	9U l        [        UR                     U l        g )
N    )r   r   r   r   paddingr   r`   validFrh   stridert   bias)epsmomentum)super__init__rg   out_dimr   	ZeroPad2drt   Conv2dr^   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr	   
hidden_act
activationrG   r]   	__class__s     r2   r|   AlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r1   pixel_valuesr@   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ N)rt   r   r   r   )rG   r   featuress      r2   forwardAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r1   )r   r   r   r}   rt   )r'   r(   r)   r*   r+   r   r|   r,   Tensorr   r0   __classcell__r   s   @r2   ro   ro      s5    	40 	4ELL U\\  r1   ro   c                   :   ^  \ rS rSr       SU 4S jjrSrU =r$ )AlignVisionDepthwiseConv2d   c	                 8   > X-  n	[         T
U ]  UU	UUUUUUUS9	  g )N)	in_channelsout_channelsrh   rw   rt   dilationgroupsrx   padding_mode)r{   r|   )rG   r   depth_multiplierrh   rw   rt   r   rx   r   r   r   s             r2   r|   #AlignVisionDepthwiseConv2d.__init__   s:     #5#%#% 	 
	
r1   r&   )r   r   r   r   r   Tzeros)r'   r(   r)   r*   r|   r0   r   r   s   @r2   r   r      s$     
 
r1   r   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jrSrU =r$ )AlignVisionExpansionLayer   zW
This corresponds to the expansion phase of each block in the original implementation.
r]   in_dimr}   rw   c                    > [         TU ]  5         [        R                  " UUSSSS9U l        [        R
                  " X1R                  S9U l        [        UR                     U l
        g )Nr   sameFr   r   rh   rt   rx   )num_featuresry   )r{   r|   r   r   expand_convr   r   	expand_bnr	   r   
expand_act)rG   r]   r   r}   rw   r   s        r2   r|   "AlignVisionExpansionLayer.__init__   sX    99 
 WBWBWX !2!23r1   r%   r@   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   rG   r%   s     r2   r   !AlignVisionExpansionLayer.forward   s4    ((7}56r1   )r   r   r   )r'   r(   r)   r*   r+   r   rd   r|   r,   r-   r   r   r0   r   r   s   @r2   r   r      sM    
40 
4# 
4 
4UX 
4U%6%6 5<<  r1   r   c            
       ~   ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjrS	\	R                  S
\	R                  4S jrSrU =r$ )AlignVisionDepthwiseLayer   zc
This corresponds to the depthwise convolution phase of each block in the original implementation.
r]   r   rw   rh   adjust_paddingc                 F  > [         TU ]  5         X0l        U R                  S:X  a  SOSn[        XES9n[        R
                  " US9U l        [        X$X6SS9U l        [        R                  " X!R                  UR                  S9U l        [        UR                     U l        g )	Nr`   ru   r   )ri   rs   Frv   r   ry   rz   )r{   r|   rw   rm   r   r~   depthwise_conv_padr   depthwise_convr   r   r   depthwise_normr	   r   depthwise_act)	rG   r]   r   rw   rh   r   conv_padrt   r   s	           r2   r|   "AlignVisionDepthwiseLayer.__init__   s     	"kkQ.7FkA"$,,w"?8FSX
 !nn%:%:VE_E_
 $F$5$56r1   r%   r@   c                     U R                   S:X  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ )Nr`   )rw   r   r   r   r   r   s     r2   r   !AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r1   )r   r   r   r   rw   r'   r(   r)   r*   r+   r   rd   boolr|   r,   r-   r   r   r0   r   r   s   @r2   r   r      s_    7!7 7 	7
 7 7,	U%6%6 	5<< 	 	r1   r   c            	       ~   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )AlignVisionSqueezeExciteLayeri   zd
This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
r]   r   
expand_dimexpandc                   > [         TU ]  5         U(       a  UOUU l        [        S[	        X!R
                  -  5      5      U l        [        R                  " SS9U l	        [        R                  " U R                  U R                  SSS9U l        [        R                  " U R                  U R                  SSS9U l        [        UR                     U l        [        R                   " 5       U l        g )Nr   )output_sizer   )r   r   rh   rt   )r{   r|   dimrc   rd   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezer   reducer   r	   r   
act_reduceSigmoid
act_expand)rG   r]   r   r   r   r   s        r2   r|   &AlignVisionSqueezeExciteLayer.__init__%  s    !':V!S*H*H!HIJ++:ii	
 ii	
 !!2!23**,r1   r%   r@   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      n[
        R                  " X!5      nU$ r   )r   r   r   r   r   r,   mul)rG   r%   inputss      r2   r   %AlignVisionSqueezeExciteLayer.forward:  sa    ]3M26M26		&8r1   )r   r   r   r   r   r   r   )Fr   r   s   @r2   r   r      sR    '0 '# '3 'X\ ' '*
U%6%6 
5<< 
 
r1   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\4U 4S	 jjr	S
\
R                  S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionFinalBlockLayeriG  zS
This corresponds to the final phase of each block in the original implementation.
r]   r   r}   rw   	drop_rateid_skipc                   > [         TU ]  5         US:H  =(       a    U(       + U l        [        R                  " UUSSSS9U l        [        R                  " X1R                  UR                  S9U l	        [        R                  " US9U l        g )Nr   r   Fr   r   )p)r{   r|   apply_dropoutr   r   project_convr   r   r   
project_bnDropoutdropout)rG   r]   r   r}   rw   r   r   r   s          r2   r|   #AlignVisionFinalBlockLayer.__init__L  sx     	#q[8[II 
 .. &;&;fF`F`
 zzI.r1   
embeddingsr%   r@   c                     U R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX!-   nU$ r   )r   r   r   r   )rG   r   r%   s      r2   r   "AlignVisionFinalBlockLayer.forward]  sE    ))-86 LL7M)6Mr1   )r   r   r   r   r'   r(   r)   r*   r+   r   rd   floatr   r|   r,   r-   r   r   r0   r   r   s   @r2   r   r   G  so    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf  r1   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\S	\S
\S\4U 4S jjr	S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionBlockih  a1  
This corresponds to the block module of original the EfficientNet vision encoder implementation.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
    in_dim (`int`):
        Number of input channels.
    out_dim (`int`):
        Number of output channels.
    stride (`int`):
        Stride size to be used in convolution layers.
    expand_ratio (`int`):
        Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
    kernel_size (`int`):
        Kernel size for the depthwise convolution layer.
    drop_rate (`float`):
        Dropout rate to be used in the final phase of each block.
    id_skip (`bool`):
        Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
        of each block. Set to `True` for the first block of each stage.
    adjust_padding (`bool`):
        Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
        operation, set to `True` for inputs with odd input sizes.
r]   r   r}   rw   expand_ratiorh   r   r   r   c
           	      f  > [         TU ]  5         XPl        U R                  S:g  U l        X%-  n
U R                  (       a  [	        XXS9U l        [        UU R                  (       a  U
OUUUU	S9U l        [        XXR                  S9U l	        [        UU R                  (       a  U
OUUUUUS9U l        g )Nr   )r]   r   r}   rw   )r]   r   rw   rh   r   )r]   r   r   r   )r]   r   r}   rw   r   r   )r{   r|   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)rG   r]   r   r}   rw   r   rh   r   r   r   expand_in_dimr   s              r2   r|   AlignVisionBlock.__init__  s     	(''1,-;;6mDN 8$(KK=V#)
 <];;
 5$(KK=V
r1   r%   r@   c                     UnU R                   S:w  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  X!5      nU$ Nr   )r   r   r   r   r   )rG   r%   r   s      r2   r   AlignVisionBlock.forward  sY    "
! NN=9M++M: ++M:
Br1   )r   r   r   r   r   r   r   r   s   @r2   r   r   h  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
 
r1   r   c                   d   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	\
   S\4S jrS	rU =r$ )
AlignVisionEncoderi  z
Forward propagates the embeddings through each vision encoder (EfficientNet) block.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
r]   c                   >^ ^ [         TT ]  5         UR                  T l        U 4S jm[        UR                  5      n[        U4S jUR                   5       5      nSn/ n[        U5       H  n[        XR                  U   5      n[        XR                  U   5      nUR                  U   n	UR                  U   n
UR                  U   n[        T" UR                  U   5      5       Hc  nUS:H  nUS:  a  SOU	n	US:  a  UOUnXAR                  ;  nUR                  U-  U-  n[        UUUU	U
UUUUS9	nUR!                  U5        US-  nMe     M     ["        R$                  " U5      T l        g )Nc                 \   > [        [        R                  " TR                  U -  5      5      $ r   )rd   mathceildepth_coefficient)repeatsrG   s    r2   round_repeats2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr1   c              3   4   >#    U  H  nT" U5      v   M     g 7fr   r&   )rE   nr   s     r2   rH   .AlignVisionEncoder.__init__.<locals>.<genexpr>  s     L3Kaq))3Ks   r   r   )	r]   r   r}   rw   rh   r   r   r   r   )r{   r|   r   rU   r   sumnum_block_repeatsrangerg   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)rG   r]   num_base_blocks
num_blockscurr_block_numr  ir   r}   rw   rh   r   jr   r   r   blockr   r   s   `                @r2   r|   AlignVisionEncoder.__init__  sp   !'!9!9	D f001L63K3KLL
'A"6+=+=a+@AF#F,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEFq&!e$%Ev!/7O7O!O"44~E
R	(!!#! +!-'##1
 e$!#' G (8 mmF+r1   r%   kwargsr@   c                 J    U R                    H  nU" U5      nM     [        US9$ N)r$   )r  r   )rG   r%   r  r  s       r2   r   AlignVisionEncoder.forward  s.    
 [[E!-0M ! .+
 	
r1   )r  r   )r'   r(   r)   r*   r+   r   r|   r,   r-   r   r   r   r   r0   r   r   s   @r2   r   r     sH    ),0 ),V

((

 +,

 
(	

 

r1   r   c                      ^  \ rS rSrSrU 4S jr    SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  4
S
 jjr
SrU =r$ )AlignTextEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                 
  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  U R#                  S[$        R*                  " U R,                  R/                  5       [$        R0                  S9SS9  g )	N)padding_idxry   position_idsr   F)
persistenttoken_type_ids)dtype)r{   r|   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   register_bufferr,   rT   r   r   r  sizelongr   s     r2   r|   AlignTextEmbeddings.__init__  s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r1   N	input_idsr  r  inputs_embedsr@   c                 @   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr  r   r  r   )r  rP   )r*  r  hasattrr  r   r,   r   r+  rP   r!  r%  r#  r&  r   )rG   r-  r  r  r.  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr%  r   r#  s               r2   r   AlignTextEmbeddings.forward  s.     #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
"66|D)
^^J/
\\*-
r1   )r&  r   r#  r%  r!  )NNNN)r'   r(   r)   r*   r+   r|   r,   
LongTensorr-   r   r   r0   r   r   s   @r2   r  r    s    Q
$ .2260426&##d*& ((4/& &&-	&
 ((4/& 
& &r1   r  modulequerykeyvalueattention_maskscalingr   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr`   r   r  )r   r  )r   trainingr   )r,   matmul	transposer   rR   softmaxfloat32tor  r   r>  
contiguous)
r7  r8  r9  r:  r;  r<  r   r  attn_weightsattn_outputs
             r2   eager_attention_forwardrG  6  s     <<}}Q':;gEL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r1   c                      ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\
\R                  \R                  S-  4   4S jjrS	rU =r$ )AlignTextSelfAttentioniL  c                 6  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        UR                   U l        U R                  S-  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )r{   r|   r  num_attention_headsr0  
ValueErrorr]   rd   attention_head_sizeall_head_sizer   Linearr8  r9  r:  r   attention_probs_dropout_probr   attention_dropoutr<  r   s     r2   r|   AlignTextSelfAttention.__init__M  sD    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r1   Nr%   r;  r  r@   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  (       d  SOU R                  U R                  S.UD6u  pU
R                  " / UQSP76 R!                  5       n
X4$ )Nr  r   r`           )r   r<  )shaperO  r8  viewr@  r9  r:  r   get_interfacer]   _attn_implementationrG  r>  rS  r<  reshaperD  )rG   r%   r;  r  r1  hidden_shapequery_states
key_statesvalue_statesattention_interfacerF  rE  s               r2   r   AlignTextSelfAttention.forwardb  s8    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFH((r1   )
rP  rS  rO  r]   r   r9  rM  r8  r<  r:  r   )r'   r(   r)   r*   r|   r,   r   r-   r   r   r/   r   r0   r   r   s   @r2   rI  rI  L  si    60 48)||) ))D0) +,	)
 
u||U\\D00	1) )r1   rI  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr  )r{   r|   r   rQ  r  denser&  r'  r   r(  r   r   s     r2   r|   AlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r1   r%   input_tensorr@   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rf  r   r&  rG   r%   rh  s      r2   r   AlignTextSelfOutput.forward  5    

=1]3}'CDr1   r&  rf  r   
r'   r(   r)   r*   r|   r,   r   r   r0   r   r   s   @r2   rc  rc    6    >U\\  RWR^R^  r1   rc  c            	          ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\R                  4S jjr
S	rU =r$ )AlignTextAttentioni  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r   )r{   r|   rI  rG   rc  outputr   s     r2   r|   AlignTextAttention.__init__  s&    *62	)&1r1   Nr%   r;  r  r@   c                 Z    UnU R                   " U4SU0UD6u  pU R                  X5      nU$ Nr;  )rG   rt  )rG   r%   r;  r  residual_s         r2   r   AlignTextAttention.forward  sE     !99
)
 

 M<r1   )rt  rG   r   )r'   r(   r)   r*   r|   r,   r   r-   r   r   r   r0   r   r   s   @r2   rr  rr    sV    2 48|| ))D0 +,	
 
 r1   rr  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r{   r|   r   rQ  r  intermediate_sizerf  rk   r   strr	   intermediate_act_fnr   s     r2   r|   AlignTextIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r1   r%   r@   c                 J    U R                  U5      nU R                  U5      nU$ r   rf  r  r   s     r2   r   AlignTextIntermediate.forward  s&    

=100?r1   r  ro  r   s   @r2   r|  r|    s(    9U\\ ell  r1   r|  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g re  )r{   r|   r   rQ  r~  r  rf  r&  r'  r   r(  r   r   s     r2   r|   AlignTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r1   r%   rh  r@   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rj  rk  s      r2   r   AlignTextOutput.forward  rm  r1   rn  ro  r   s   @r2   r  r    rp  r1   r  c            	          ^  \ rS rSrU 4S jr SS\R                  S\R                  S-  S\\	   S\R                  4S jjr
S	 rS
rU =r$ )AlignTextLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g r   )
r{   r|   chunk_size_feed_forwardseq_len_dimrr  	attentionr|  intermediater  rt  r   s     r2   r|   AlignTextLayer.__init__  sI    '-'E'E$+F31&9%f-r1   Nr%   r;  r  r@   c                     U R                   " U4SU0UD6n[        U R                  U R                  U R                  U5      nU$ rw  )r  r   feed_forward_chunkr  r  )rG   r%   r;  r  s       r2   r   AlignTextLayer.forward  sW     
)
 
 2##T%A%A4CSCSUb
 r1   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  rt  )rG   attention_outputintermediate_outputlayer_outputs       r2   r  !AlignTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir1   )r  r  r  rt  r  r   )r'   r(   r)   r*   r|   r,   r   r-   r   r   r   r  r0   r   r   s   @r2   r  r    s[    . 48|| ))D0 +,	
 
$ r1   r  c            	       |   ^  \ rS rSrU 4S jr S
S\R                  S\R                  S-  S\\	   S\
4S jjrS	rU =r$ )AlignTextEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r{   r|   r]   r   r  r   num_hidden_layersr  layergradient_checkpointing)rG   r]   r
  r   s      r2   r|   AlignTextEncoder.__init__  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A&Nr%   r;  r  r@   c                 N    U R                    H  nU" UU40 UD6nM     [        US9$ r  )r  r   )rG   r%   r;  r  layer_modules        r2   r   AlignTextEncoder.forward  s>     !JJL( M ' +
 	
r1   )r]   r  r  r   )r'   r(   r)   r*   r|   r,   r   r-   r   r   r   r   r0   r   r   s   @r2   r  r    sR    , 48
||
 ))D0
 +,	

 

 
r1   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r{   r|   r   rQ  r  rf  Tanhr   r   s     r2   r|   AlignTextPooler.__init__  s9    YYv1163E3EF
'')r1   r%   r@   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )rf  r   )rG   r%   first_token_tensorpooled_outputs       r2   r   AlignTextPooler.forward  s6     +1a40

#566r1   )r   rf  ro  r   s   @r2   r  r    s(    $
U\\ ell  r1   r  c                   v    \ rS rSr% \\S'   SrSrSr\	R                  " 5       S\R                  4S j5       rSrg	)
AlignPreTrainedModeli  r]   align)imagetextTr7  c                 t   U R                   R                  n[        U[        R                  [        R
                  45      (       aO  [        R                  " UR                  SUS9  UR                  b   [        R                  " UR                  5        GO4[        U[        5      (       a  [        R                  " UR                  R                  5        [        R                  " UR                  R                  5        [        R                  " UR                  U R                   R                   5        O[        U[        R"                  5      (       av  [        R                  " UR                  SUS9  UR$                  bI  ['        UR                  SS5      (       d-  [        R                  " UR                  UR$                     5        [        U[        R(                  [        R*                  45      (       a  [        R                  " UR                  5        [        R,                  " UR                  5        ['        USS5      ba  [        R                  " UR.                  5        [        R,                  " UR0                  5        [        R                  " UR2                  5        gg[        U[4        5      (       a|  [        R6                  " UR8                  [:        R<                  " UR8                  R>                  S   5      RA                  S5      5        [        R                  " URB                  5        gg)	zInitialize the weightsrV  )meanstdN_is_hf_initializedFrunning_meanr  r  )"r]   initializer_rangerk   r   rQ  r   initnormal_weightrx   zeros_
AlignModelxavier_uniform_text_projection	constant_temperaturetemperature_init_valuer  r  rC   r&  r   ones_r  running_varnum_batches_trackedr  copy_r  r,   rT   rW  r   r  )rG   r7  r  s      r2   _init_weights"AlignPreTrainedModel._init_weights  s     kk++fryy"))455LLSc:{{&FKK(
++  !7!7!>!>?KK..334NN6--t{{/Q/QR--LLSc:!!-gfmmMach6i6iFMM&*<*<=>fr||R^^<==KK$JJv}}%v~t4@F//0

6--.F667 A  344JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 5r1   r&   N)r'   r(   r)   r*   r   r.   base_model_prefixinput_modalitiessupports_gradient_checkpointingr,   no_gradr   Moduler  r0   r&   r1   r2   r  r    s=    (&*#
]]_/BII / /r1   r  zJ
    The text model from ALIGN without any head or projection on top.
    c                   D  ^  \ rS rSr% \\S'   SrS/r\\	S.r
SS\S\4U 4S jjjrS rS	 r\\\     SS\R&                  S
-  S\R&                  S
-  S\R&                  S
-  S\R&                  S
-  S\R&                  S
-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )AlignTextModeli4  r]   )r  r  )r%   r7   add_pooling_layerc                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
r{   r|   r]   r  r   r  encoderr  pooler	post_init)rG   r]   r  r   s      r2   r|   AlignTextModel.__init__B  sK    
 	 -f5'/1Bof- 	r1   c                 .    U R                   R                  $ r   r   r!  rK   s    r2   get_input_embeddings#AlignTextModel.get_input_embeddingsR  s    ...r1   c                 $    XR                   l        g r   r  )rG   r:  s     r2   set_input_embeddings#AlignTextModel.set_input_embeddingsU  s    */'r1   Nr-  r;  r  r  r.  r  r@   c                    Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[        S5      eUu  pUb  UR                  OUR                  n
Uc  [        R
                  " X4U
S9nU R                  UUUUS9n[        U R                  UUS9nU R                  " U4SU0UD6nUS	   nU R                  b  U R                  U5      OSn[        UUS
9$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, AlignTextModel

>>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer  z5You have to specify either input_ids or inputs_embedsrO   )r-  r  r  r.  )r]   r.  r;  r;  r   r$   pooler_output)rN  %warn_if_padding_and_no_attention_maskr*  rP   r,   onesr   r
   r]   r  r  r   )rG   r-  r;  r  r  r.  r  r1  
batch_sizer2  rP   embedding_outputencoder_outputssequence_outputr  s                  r2   r   AlignTextModel.forwardX  s2   6  ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN??%)'	 + 
 3;;*)
 ,,
)
 

 *!,8<8OO4UY)-'
 	
r1   r]   r   r  r  TNNNNN)r'   r(   r)   r*   r   r.   r  _no_split_modulesr  rI  _can_record_outputsr   r|   r  r  r   r   r   r,   r   r   r   r/   r   r   r0   r   r   s   @r2   r  r  4  s     ./',
 4   /0   *..2.2,0-1@
<<$&@
 t+@
 t+	@

 llT)@
 ||d*@
 +,@
 
+	+@
    @
r1   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSrSrSr	S/r
S\0rS\4U 4S	 jjr\\\ SS\R$                  S
-  S\\   S\\-  4S jj5       5       5       rSrU =r$ )AlignVisionModeli  r]   r   )r  Fr   r   r%   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:X  a%  [        R                  " UR                  SS9U l        OMUR                  S:X  a%  [        R                  " UR                  SS9U l        O[        SUR                   35      eU R                  5         g )Nr  T)	ceil_moderc   z2config.pooling must be one of ['mean', 'max'] got )r{   r|   r]   ro   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2drN  poolingr  r   s     r2   r|   AlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r1   Nr  r@   c                     Uc  [        S5      eU R                  U5      nU R                  " U40 UD6nUS   nU R                  U5      nUR	                  UR
                  SS 5      n[        UUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, AlignVisionModel

>>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```Nz You have to specify pixel_valuesr   r`   r  )rN  r   r  r  r[  rW  r   )rG   r   r  r  r  r$   r  s          r2   r   AlignVisionModel.forward  s    < ?@@??<8,,

 ,A.$56%--m.A.A"1.EF7/'
 	
r1   r  r   )r'   r(   r)   r*   r   r.   main_input_namer  r  _input_embed_layerr  r   r  r|   r   r   r   r,   r-   r   r   r/   r   r   r0   r   r   s   @r2   r  r    s     $O!&+#&+,)0 "   26*
''$.*
 +,*
 
9	9	*
    *
r1   r  c                   ^  ^  \ rS rSr% \\S'   S\4U 4S jjr\\     SS\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\\   S\\-  4S jj5       5       r\\S\	R                   S
\\   S\\-  4S j5       5       r\\       SS\	R$                  S-  S\	R                   S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S	\	R                  S-  S\S-  S
\\   S\\-  4S jj5       5       rSrU =r$ )r  i  r]   c                   > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l        [        U5      U l        [        U5      U l        [         R"                  " U R                  U R                  5      U l        [         R&                  " [(        R*                  " U R,                  R.                  5      5      U l        U R3                  5         g )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )r{   r|   rk   text_configr   	TypeErrortypevision_configr   projection_dimr  text_embed_dimr  
text_modelr  vision_modelr   rQ  r  	Parameterr,   tensorr]   r  r  r  )rG   r]   r  r   r   s       r2   r|   AlignModel.__init__  s)    &,,o>>++,-Q0 
 &..0ABB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r1   Nr-  r;  r  r  r.  r  r@   c           	          U R                   " SUUUUUS.UD6nUS   SS2SSS24   nU R                  U5      Ul        U$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```r-  r;  r  r  r.  r   Nr&   )r  r  r  )	rG   r-  r;  r  r  r.  r  text_outputsr$   s	            r2   get_text_featuresAlignModel.get_text_features  sa    2 48?? 4
))%'4
 4
 )OAq!G4%)%9%9:K%L"r1   r   c                 *    U R                   " SSU0UD6$ )a  
Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AlignModel
>>> from transformers.image_utils import load_image

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```r   r&   )r  )rG   r   r  s      r2   get_image_featuresAlignModel.get_image_features7  s    .   ElEfEEr1   return_lossc           
         U R                   " SSU0UD6n	U R                  " SUUUUUS.UD6n
U	S   nU
S   SS2SSS24   nU R                  U5      nXR                  SSSS	9-  nXR                  SSSS	9-  n[        R
                  " XR                  5       5      U R                  -  nUR                  5       nSnU(       a  [        U5      n[        UUUUUU
U	S
9$ )a^  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AlignModel
>>> from transformers.image_utils import load_image

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(
...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
... )

>>> with torch.inference_mode():
...     outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```r   r	  r   r   Nr`   r  T)r   r   keepdim)r;   r<   r=   r6   r#   r>   r?   r&   )
r  r  r  normr,   r?  rY   r  r\   r9   )rG   r-  r   r;  r  r  r.  r  r  vision_outputsr
  r#   r6   r=   r<   r;   s                   r2   r   AlignModel.forwardP  s   N ** 
%


  
))%'
 
 &a("1oaAg.**;7 $&7&7!T&7&RR!$4$4qb$$4$OO  ,,{NN4DEHXHXX*,,.o.D-+#%* .
 	
r1   )r  r  r  r  r  r  r  )NNNNNNN)r'   r(   r)   r*   r   r.   r|   r   r   r,   r   r   r   r/   r   r  r-   r  r6  r   r9   r   r0   r   r   s   @r2   r  r    s   { <  *..2.2,0-1"<<$&" t+" t+	"
 llT)" ||d*" +," 
+	+"  "H F!--F9?@R9SF	+	+F  F.  .215.2.2,0-1#'K
##d*K
 ''$.K
 t+	K

 t+K
 llT)K
 ||d*K
 D[K
 +,K
 
	K
  K
r1   r  )r  r  r  r  r  )rV  )Ur+   r   collections.abcr   dataclassesr   typingr   r,   r    r   r  activationsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_alignr   r   r   
get_loggerr'   loggerr!   r4   r9   r   rV   r\   rd   rg   r/   r   rm   r  ro   r   r   r   r   r   r   r   r   r  r   rG  rI  rc  rr  r|  r  r  r  r  r  r  r  r  __all__r&   r1   r2   <module>r(     s_     $ !    & ! 6 9  G & 6 M M I 5 P P 
		H	% 
 :[ : : 
 	7; 	7 	7 
 
+  
   
JuU\\ uell u-5<< -ELL -+ 3  @S5[ @$ @*BII 4
 
6		 6$		 $P$BII $N BNryy Nb>
 >
B9")) 9F %II%<<% 
% <<	%
 LL4'% % %,3)RYY 3)n"))  .BII  bii / B
ryy 
4bii   /?  /  /F 
b
) b

b
J 
I
+ I

I
X m
% m
 m
` Wr1   