
    3j@                     B   S SK r S SKJrJr  S SKJr  S SKrS SKrS SKJ	r	J
r
  SSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJrJrJr  SSKJrJr  SSK J!r!  SSK"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.  \&" SS9\ " S S\5      5       5       r/\&" SS9\ " S S\5      5       5       r0\&" SS9\ " S S\5      5       5       r1\&" SS9\ " S S \5      5       5       r2\&" S!S9\ " S" S#\5      5       5       r3 " S$ S%\
Rh                  5      r5\#" S&S'9S(\6S)\6S*\Rn                  S+\Rp                  S,\R                  4
S- j5       r9   SS.\R                  S/\:S-  S0\:S-  S1\:S-  S,\R                  4
S2 jjr; " S3 S4\
Rh                  5      r<\" S55       " S6 S7\
Rh                  5      5       r=S8 r>   SS9\
Rh                  S:\R                  S;\R                  S<\R                  S=\R                  S-  S>\:\6-  S?\:S-  S@\:S-  S,\?\R                  \R                  4   4SA jjr@SB\R                  SC\R                  SD\R                  SE\R                  S,\?\R                  \R                  4   4
SF jrASG\R                  SH\6S,\R                  4SI jrB " SJ SK\
Rh                  5      rC " SL SM\
Rh                  5      rD " SN SO\
Rh                  5      rE " SP SQ\
Rh                  5      rF " SR SS\
Rh                  5      rG " ST SU\5      rH " SV SW\
Rh                  5      rI " SX SY\
Rh                  5      rJ " SZ S[\
Rh                  5      rK " S\ S]\
Rh                  5      rL " S^ S_\
Rh                  5      rM\& " S` Sa\5      5       rN " Sb Sc\N5      rO\& " Sd Se\N5      5       rP\& " Sf Sg\\N5      5       rQ\&" ShSi9 " Sj Sk\N5      5       rRSSl jrS\&" SmSnSo9 " Sp Sq\N5      5       rT\&" SrSsSo9 " St Su\N5      5       rU\&" SvSwSo9 " Sx Sy\N5      5       rV\&" SzS{So9 " S| S}\N5      5       rW/ S~QrXg)    N)CallableIterable)	dataclass)Tensornn   )initialization)ACT2FN)BackboneMixinfilter_output_hidden_states)use_kernel_forward_from_hub)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingModelOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)#compile_compatible_method_lru_cache)TransformersKwargsauto_docstring)can_return_tuplemaybe_autocastmerge_with_config_defaults)capture_outputs   )Sapiens2Configz
    Output type of [`Sapiens2Backbone`], extending [`BackboneOutput`] with optional CLS tokens from
    each selected feature stage (used when `config.return_class_token=True`).
    )custom_introc                   H    \ rS rSr% SrSr\\R                     S-  \	S'   Sr
g)Sapiens2BackboneOutput0   z
cls_tokens (`tuple(torch.FloatTensor)`, *optional*):
    CLS token from each selected feature stage, each of shape `(batch_size, hidden_size)`.
    Only present when `config.return_class_token=True`.
N
cls_tokens )__name__
__module____qualname____firstlineno____doc__r$   tupletorchFloatTensor__annotations____static_attributes__r%       h/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/sapiens2/modeling_sapiens2.pyr"   r"   0   s#     37Je''(4/6r0   r"   z6
    Class for outputs of pose estimation models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
Sapiens2PoseEstimatorOutputD   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Pose estimation loss.
heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
    Heatmaps as predicted by the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
    (also called feature maps) of the model at the output of each stage.
Nlossheatmaps.hidden_states
attentionsr%   )r&   r'   r(   r)   r*   r5   r,   r-   r.   r6   r7   r+   r8   r/   r%   r0   r1   r3   r3   D   sq    	 &*D%

d
"))-He$&-:>M5**C/047>7;Je'',-4;r0   r3   z8
    Class for outputs of normal estimation models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
Sapiens2NormalEstimatorOutput\   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Normal estimation loss.
normals (`torch.FloatTensor` of shape `(batch_size, num_labels, height, width)`):
    Raw normal map predictions as output by the model (unnormalized).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage)
    of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
    each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one per layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`. Attentions weights after the attention softmax.
Nr5   normals.r7   r8   r%   )r&   r'   r(   r)   r*   r5   r,   r-   r.   r<   r7   r+   r8   r/   r%   r0   r1   r:   r:   \   sq     &*D%

d
")(,GU%,:>M5**C/047>7;Je'',-4;r0   r:   z:
    Class for outputs of pointmap estimation models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)Sapiens2PointmapEstimatorOutputw   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Pointmap estimation loss.
pointmaps (`torch.FloatTensor` of shape `(batch_size, 3, height, width)`):
    Per-pixel 3D XYZ coordinate predictions in canonical camera space.
scales (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
    Canonical focal length / actual focal length ratio. `None` when no scale branch is configured.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage)
    of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
    each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one per layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`. Attentions weights after the attention softmax.
Nr5   	pointmapsscales.r7   r8   r%   )r&   r'   r(   r)   r*   r5   r,   r-   r.   r@   rA   r7   r+   r8   r/   r%   r0   r1   r>   r>   w   s      &*D%

d
")*.Iu  4'.'+FE$+:>M5**C/047>7;Je'',-4;r0   r>   z4
    Class for outputs of image matting models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S'   S	rg)
Sapiens2ImageMattingOutput   a"  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Loss.
alphas (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`):
    Estimated alpha values.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
    (also called feature maps) of the model at the output of each stage.
foregrounds (`torch.FloatTensor` of shape `(batch_size, 3, height, width)`):
    Pre-multiplied RGB foreground predictions in `[0, 1]` (sigmoid-activated).
Nr5   alphasr7   r8   foregroundsr%   )r&   r'   r(   r)   r*   r5   r,   r-   r.   rE   r7   r+   r8   rF   r/   r%   r0   r1   rC   rC      s|     &*D%

d
")'+FE$+59M5**+d2926Je''(4/6,0K""T)0r0   rC   c                      ^  \ rS rSrSrS\4U 4S jjrSS\R                  S\R                  S-  S\R                  4S	 jjr	S
r
U =r$ )Sapiens2Embeddings   zE
Construct the CLS token, mask token, position and patch embeddings.
configc                 R  > [         TU ]  5         Xl        [        R                  " [
        R                  " SSUR                  5      5      U l        UR                  (       a6  [        R                  " [
        R                  " SSUR                  5      5      OS U l        [        R                  " [
        R                  " SUR                  UR                  5      5      U l        [        R                  " UR                   UR                  UR"                  UR"                  S9U l        g )Nr   )kernel_sizestride)super__init__rJ   r   	Parameterr,   randnhidden_size	cls_tokenuse_mask_tokenzeros
mask_tokenemptynum_register_tokensregister_tokensConv2dnum_channels
patch_sizepatch_embeddingsselfrJ   	__class__s     r1   rO   Sapiens2Embeddings.__init__   s    ekk!Q8J8J&KLQWQfQf",,u{{1a9K9K'LMlp!||EKK6;U;UW]WiWi,jk "		!3!3ARAR[a[l[l!
r0   Npixel_valuesbool_masked_posreturnc                 V   Ub  U R                   c  [        S5      eUR                  S   nU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nUbK  U R                   R                  UR
                  5      n[        R                  " UR                  S5      Xe5      nU R                  R                  USS5      nU R                  R                  USS5      n[        R                  " XxU/SS9n	U	$ )Nz:bool_masked_pos requires use_mask_token=True in the configr   dtype   r   dim)rV   
ValueErrorshaper]   weightrg   toflatten	transposer,   where	unsqueezerS   expandrY   cat)
r_   rb   rc   
batch_sizetarget_dtyper]   rV   rS   rY   
embeddingss
             r1   forwardSapiens2Embeddings.forward   s   &4??+BYZZ!''*
,,3399  00|1TU+33A6@@AF&++,<,B,BCJ${{?+D+DR+H*g NN))*b"=	..55j"bIYY	<LMSTU
r0   )rS   rJ   rV   r]   rY   N)r&   r'   r(   r)   r*   r   rO   r,   r   ry   r/   __classcell__r`   s   @r1   rH   rH      sH    
~ 
ELL 5<<RVCV bgbnbn  r0   rH       )maxsizenum_patches_hnum_patches_wrg   devicerd   c                     [         R                  " SXUS9n[         R                  " SXUS9nX@-  nXQ-  n[         R                  " [         R                  " XESS9SS9nUR	                  SS5      nS	U-  S
-
  nU$ )aI  
Computes the 2D coordinates of the centers of image patches, normalized to the range [-1, +1].
The center of each patch is exactly halfway between its top-left and bottom-right corners.

Args:
    num_patches_h (int): Number of patches along the vertical (height) axis.
    num_patches_w (int): Number of patches along the horizontal (width) axis.
    dtype (torch.dtype): The desired data type of the returned tensor.

Returns:
    torch.Tensor: A tensor of shape (height * width, 2), where each row contains the (y, x)
        coordinates of a patch center, normalized to [-1, +1].
g      ?rg   r   ij)indexingri   rj   r   r   g       @g      ?)r,   arangestackmeshgridrp   )r   r   rg   r   coords_hcoords_wcoordss          r1   get_patches_center_coordinatesr      sw    " ||CFKH||CFKH'H'H[[TJPRSF^^Aq!F6\CFMr0   r   shiftjitterrescalec                 .   UbA  [         R                  " SU R                  U R                  S9nUR	                  U* U5      nX-   n Ube  [
        R                  " U5      n[         R                  " SU R                  U R                  S9nUR	                  U* U5      R                  5       nX-  n Ube  [
        R                  " U5      n[         R                  " SU R                  U R                  S9nUR	                  U* U5      R                  5       nX-  n U $ )N)r   rh   )r   rg   r   )r,   rW   r   rg   uniform_nplogexp)	r   r   r   r   shift_hwjitter_range	jitter_hwrescale_range
rescale_hws	            r1   "augment_patches_center_coordinatesr      s     ;;vfmm6<<P$$eVU3" vvf~KKv}}FLLQ	&&}lCGGI	# w[[6==M
((-GKKM
$Mr0   c                      ^  \ rS rSr% \R
                  \S'   S\4U 4S jjrS\R
                  S\	\R
                  \R
                  4   4S jr
SrU =r$ )	Sapiens2RopePositionEmbeddingi  inv_freqrJ   c           	      .  > [         T	U ]  5         Xl        UR                  U l        UR
                  UR                  -  U l        SU R                  [        R                  " SSSU R                  -  [        R                  S9-  -  nU R                  SUSS9  UR                  n[        U[        5      (       a  UOX34u  pEUR                  n[        U[         5      (       a  UOUS   n[        U[         5      (       a  UOUS   nXG-  U l        XX-  U l        g )Nr   r      rf   r   F)
persistent)rN   rO   rJ   
rope_thetabaserR   num_attention_headshead_dimr,   r   float32register_buffer
image_size
isinstancer   r\   intr   r   )
r_   rJ   r   r   image_himage_wr\   patch_size_hpatch_size_wr`   s
            r1   rO   &Sapiens2RopePositionEmbedding.__init__  s    %%	**f.H.HHtyyELLAq4==7HPUP]P]$^^^ZeD&&
)3J)I)I:PZOg&&
%/
C%@%@zjQRm%/
C%@%@zjQRm$4$4r0   rb   rd   c                    UR                   u    p#nX0R                  R                  -  nX@R                  R                  -  nUR                  n[	        UR
                  [        5      (       a  UR
                  S:w  a  UR
                  OSn[        USS9   [        XV[        R                  US9n	U R                  (       aH  [        U	U R                  R                  U R                  R                  U R                  R                  S9n	S[         R"                  -  U	S S 2S S 2S 4   -  U R$                  S S S S 24   -  n
U
R'                  SS5      n
U
R)                  S5      n
[        R*                  " U
5      n[        R,                  " U
5      nS S S 5        UR.                  nWR1                  US	9WR1                  US	94$ ! , (       d  f       N8= f)
NmpscpuF)device_typeenabledr   )r   r   r   rh   r   rf   )rm   rJ   r\   r   r   typestrr   r   r,   r   trainingr   pos_embed_shiftpos_embed_jitterpos_embed_rescalemathpir   rp   tilecossinrg   ro   )r_   rb   _heightwidthr   r   r   r   patch_coordsanglesr   r   rg   s                 r1   ry   %Sapiens2RopePositionEmbedding.forward$  su   *001e++"8"88!7!77$$%/S%A%AfkkUZFZfkk`eUC :EMM&L }}A ++55;;77 KK99	  [<1d
#;;dmmDRVXYM>ZZF^^Aq)F[[^F))F#C))F#C+ D. ""vvEv"CFFF$7771 DCs   C7F==
G)r   rJ   r   r   r   )r&   r'   r(   r)   r,   r   r.   r   rO   r+   ry   r/   r|   r}   s   @r1   r   r     sG    ll5~ 5" 8ELL  8U5<<;U5V  8  8r0   r   RMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )Sapiens2RMSNormiG  epsrd   Nc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z.
Sapiens2RMSNorm is equivalent to T5LayerNorm
N)rN   rO   r   rP   r,   onesrn   variance_epsilon)r_   rR   r   r`   s      r1   rO   Sapiens2RMSNorm.__init__I  s/     	ll5::k#:; #r0   r7   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nrh   ri   T)keepdim)	rg   ro   r,   r   powmeanrsqrtr   rn   )r_   r7   input_dtypevariances       r1   ry   Sapiens2RMSNorm.forwardQ  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r0   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r+   rn   rm   r   r_   s    r1   
extra_reprSapiens2RMSNorm.extra_reprX  s*    ))*+6$2G2G1HIIr0   )r   rn   )gư>)r&   r'   r(   r)   floatrO   r,   r   ry   r   r/   r|   r}   s   @r1   r   r   G  sB    $ $$ $ $;U\\ ;ell ;J Jr0   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nri   rh   rj   )rm   r,   ru   )xx1x2s      r1   rotate_halfr   \  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r0   modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 j   Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb  X-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R                  5       nX4$ )N      rh   r   ri   )rk   rg   )pr   r   )r   	repeat_kvnum_key_value_groupsr,   matmulrq   tanhr   
functionalsoftmaxr   ro   rg   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs                r1   eager_attention_forwardr   c  s    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!#4 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r0   qkr   r   c                 <   U R                   S   nUR                   S   nXV-
  nU R                  Xv4SS9u  pUR                  Xv4SS9u  pX-  [        U	5      U-  -   n	X-  [        U5      U-  -   n[        R                  " X4SS9n [        R                  " X4SS9nX4$ )a  Applies Rotary Position Embedding to the query and key tensors, but only to the patch tokens,
ignoring the prefix tokens (cls token and register tokens).

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.

Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
rj   )rm   splitr   r,   ru   )r   r   r   r   r   
num_tokensnum_patchesnum_prefix_tokensq_prefix_tokens	q_patchesk_prefix_tokens	k_patchess               r1   apply_rotary_pos_embr    s      J))B-K"0!"*;)Ir!RO!"*;)Ir!RO [%;c%ABI[%;c%ABI		?.B7A		?.B7A4Kr0   r7   n_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rm   rt   reshape)r7   r  batchnum_key_value_headsslenr   s         r1   r   r     s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr0   c                     ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\R                  S-  S	\
\R                  \R                  4   S-  S
\\   S\
\R                  \R                  S-  4   4
S jjrSrU =r$ )Sapiens2Attentioni  zA
Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
rJ   	layer_idxc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        SU l        U R                  S-  U l	        SU l        UR                  U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                   S9U l        UR$                  U   U l        U R                  U R&                  -  U l        [        R                  " U R                  U R&                  U R                  -  UR*                  S9U l        [        R                  " U R                  U R&                  U R                  -  UR.                  S9U l        UR2                  (       a  [5        U R                  UR6                  S9O[        R8                  " 5       U l        UR2                  (       a$  [5        U R                  UR6                  S9U l        g [        R8                  " 5       U l        g )NFr   biasr   )rN   rO   rJ   rR   	embed_dimr   	num_headsr   	is_causalr   attention_dropoutr   r   Linear
query_biasq_proj	proj_biaso_projnum_key_value_heads_per_layerr	  r   key_biask_proj
value_biasv_projuse_qk_normr   rms_norm_epsIdentityq_normk_normr_   rJ   r  r`   s      r1   rO   Sapiens2Attention.__init__  s   ++33$..8}}d*//iiVEVEVWiiVEUEUV#)#G#G	#R $(NNd6N6N$N!ii0H0H4==0X_e_n_noii0H0H4==0X_e_p_pqQWQcQcodmm9L9LMikititivQWQcQcodmm9L9LMikititivr0   Nr7   r   position_embeddingsr   rd   c                 
   UR                   SS n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      nU R                  U5      nUu  p[        XxX5      u  px[        R                  " U R                  R                  [        5      nU" U UUU	U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nX4$ )z#Input shape: Batch x Time x ChannelNri   r   rh           )r   r   )rm   r   r  viewrq   r  r  r#  r$  r  r   get_interfacerJ   _attn_implementationr   r   r   r   r  r   r  )r_   r7   r   r'  r   input_shapehidden_shapequery_statesr   r   r   r   attention_interfacer   r   s                  r1   ry   Sapiens2Attention.forward  sv    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ (?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
! "));;;;FFHkk+.((r0   )rJ   r   r  r   r  r$  r  r  r   r	  r  r#  r  r   r  NNr&   r'   r(   r)   r*   r   r   rO   r,   r   r+   r   r   ry   r/   r|   r}   s   @r1   r  r    s    w~ w# w2 /3HL	$)||$) t+$) #5<<#=>E	$)
 +,$) 
u||U\\D00	1$) $)r0   r  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )Sapiens2LayerScalei  rd   c                    > [         TU ]  5         [        R                  " UR                  [
        R                  " UR                  5      -  5      U l        g r{   )	rN   rO   r   rP   layerscale_valuer,   r   rR   lambda1r^   s     r1   rO   Sapiens2LayerScale.__init__  s8    ||F$;$;ejjI[I[>\$\]r0   hidden_statec                     XR                   -  $ r{   r8  )r_   r:  s     r1   ry   Sapiens2LayerScale.forward  s    ll**r0   r<  rd   N)
r&   r'   r(   r)   rO   r,   r   ry   r/   r|   r}   s   @r1   r5  r5    s)    ^+ELL +U\\ + +r0   r5  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Sapiens2MLPi  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [        UR                     U l        g Nr  )rN   rO   rJ   rR   intermediate_sizer   r  mlp_biasup_proj	down_projr
   
hidden_actact_fnr^   s     r1   rO   Sapiens2MLP.__init__  s    !--!'!9!9yy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r0   c                 `    U R                  U R                  U R                  U5      5      5      $ r{   )rF  rH  rE  )r_   r   s     r1   ry   Sapiens2MLP.forward  s"    ~~dkk$,,q/:;;r0   )rH  rJ   rF  rR   rC  rE  r&   r'   r(   r)   rO   ry   r/   r|   r}   s   @r1   r@  r@    s    0< <r0   r@  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Sapiens2GatedMLPi  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g rB  )rN   rO   rJ   rR   rC  r   r  rD  	gate_projrE  rF  r
   rG  rH  r^   s     r1   rO   Sapiens2GatedMLP.__init__  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r0   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r{   )rF  rH  rP  rE  )r_   r   rF  s      r1   ry   Sapiens2GatedMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r0   )rH  rJ   rF  rP  rR   rC  rE  rL  r}   s   @r1   rN  rN    s    0 r0   rN  c                      ^  \ rS rSrSrSS\SS4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )Sapiens2DropPathi  zStochastic depth (DropPath) per sample, for residual blocks.

Identity when ``drop_prob`` is 0 or outside training. See `Deep Networks with Stochastic Depth
<https://arxiv.org/abs/1603.09382>`_.
	drop_probrd   Nc                 .   > [         TU ]  5         Xl        g r{   )rN   rO   rV  )r_   rV  r`   s     r1   rO   Sapiens2DropPath.__init__"  s    "r0   r7   c                 V   U R                   S:X  d  U R                  (       d  U$ SU R                   -
  nUR                  S   4SUR                  S-
  -  -   n[        R
                  " X1R                  UR                  S9n[        R                  " XB-   5      nUR                  U5      U-  $ )Nr)  r   r   )r   r   )
rV  r   rm   ndimr,   randrg   r   floordiv)r_   r7   	keep_probrm   random_tensors        r1   ry   Sapiens2DropPath.forward&  s    >>S   &	$$Q')DM4F4F4J,KK

50C0CML`L`aM$=>  +m;;r0   c                      SU R                    3$ )Nzp=rV  r   s    r1   r   Sapiens2DropPath.extra_repr/  s    DNN#$$r0   rb  )r)  )r&   r'   r(   r)   r*   r   rO   r,   r   ry   r   r   r/   r|   r}   s   @r1   rU  rU    sL    #% #$ # #<U\\ <ell <%C % %r0   rU  c                      ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\R                  S-  S	\
\R                  \R                  4   S-  S
\\   S\R                  4
S jjrSrU =r$ )Sapiens2Layeri3  zCThis corresponds to the Block class in the original implementation.rJ   r  c                    > [         TU ]  5         [        UR                  UR                  S9U l        [        XS9U l        [        U5      U l	        UR                  S:  a  [        UR                  5      O[        R                  " 5       U l        [        UR                  UR                  S9U l        UR                   (       a  [#        U5      U l        O['        U5      U l        [        R                  " 5       U l        g )Nr  r  r)  )rN   rO   r   rR   r!  norm1r  	attentionr5  layer_scale1drop_path_raterU  r   r"  	drop_pathnorm2use_gated_mlprN  mlpr@  layer_scale2r%  s      r1   rO   Sapiens2Layer.__init__6  s    $V%7%7V=P=PQ
*6G.v6DJDYDY\_D_)&*?*?@egepeper$V%7%7V=P=PQ
'/DH"6*DHKKMr0   Nr7   r   r'  r   rd   c                 8   UnU R                  U5      nU R                  " U4UUS.UD6u  pU R                  U5      nU R                  U5      U-   nUnU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      U-   nU$ )N)r   r'  )rh  ri  rj  rl  rm  ro  rp  )r_   r7   r   r'  r   residualr   s          r1   ry   Sapiens2Layer.forwardD  s     !

=1>>
) 3
 	
 ))-8}5@ !

=1/))-8}5@r0   )ri  rl  rj  rp  ro  rh  rm  r2  r3  r}   s   @r1   re  re  3  s    M*~ *# *" /3HL	|| t+ #5<<#=>E	
 +, 
 r0   re  c                      ^  \ rS rSrSr         SS\S\S\\\\4   -  S\S\\\\4   -  \-  S\S	\S
\S\S\S\4U 4S jjjr	S\
R                  S\
R                  4S jrSrU =r$ )Sapiens2ConvLayeria  z[
A basic wrapper for Convolution-BatchNorm-Activation, typically used for head components.
in_channelsout_channelsrL   rM   paddinggroups
activationr  convolution_transposepixel_shufflescale_factorc           
      8  > [         TU ]  5         U	(       a  [        R                  " UUUUS9U l        O[        R
                  " UUUUUUUS9U l        [        R                  " U5      U l        [        U   U l	        U	(       a.  [        R                  " UU
(       a  X+S-  -  OUUUUUUS9U l        O-[        R
                  " UU
(       a  X+S-  -  OUUUUUUS9U l        U
(       a  [        R                  " U5      U l        g [        R                  " 5       U l        g )N)rw  rx  rL   rM   )rw  rx  rL   rM   ry  rz  r  rh   )rL   rM   ry  r  rz  )rN   rO   r   ConvTranspose2dconvolutionrZ   InstanceNorm2dnormr
   rH  PixelShuffler"  r}  )r_   rw  rx  rL   rM   ry  rz  r{  r  r|  r}  r~  r`   s               r1   rO   Sapiens2ConvLayer.__init__f  s
    	 !11')'	 D  "yy')' D %%l3	Z( !112?Q.\' D  "yy2?Q.\' D ?LR__\:QSQ\Q\Q^r0   r7   rd   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r{   )r  r}  r  rH  r_   r7   s     r1   ry   Sapiens2ConvLayer.forward  sD    ((7**=9		-0M2r0   )rH  r  r  r}  )	r   r   r   r   siluTFFrh   )r&   r'   r(   r)   r*   r   r+   r   boolrO   r,   r   ry   r/   r|   r}   s   @r1   rv  rv  a  s     .//0 &+#6_6_ 6_ 5c?*	6_
 6_ uS#X&,6_ 6_ 6_ 6_  $6_ 6_ 6_ 6_pU\\ ell  r0   rv  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Sapiens2Headi  rJ   c                   >^ [         TU ]  5         TR                  R                  (       a   [	        TR
                  TR
                  SSS9O[        R                  " 5       U l        TR
                  /TR                  R                  S S -   n[        R                  " U4S j[        UTR                  R                  TR                  R                  5       5       5      U l        TR                  R                  S   /TR                  R                  S S -   n[        R                  " U4S j[        UTR                  R                  TR                  R                  5       5       5      U l        TR                  R                  (       a  TR                  R                  S   O?TR                  R                  (       a  TR                  R                  S   OTR
                  n[        R"                  " UTR$                  SS9U l        g )Nr   r   rL   ry  ri   c              3   p  >#    U  H  u  pn[        UUUTR                  R                  (       a  S OSTR                  R                  (       a  US -
  S-  OS [        TR                  R                  5      [        TR                  R                  5      TR                  R                  (       + S9v   M     g7f)r   rh   )rL   rM   ry  r  r}  r|  N)rv  head_configuse_pixel_shuffler  .0in_chout_chrL   rJ   s       r1   	<genexpr>(Sapiens2Head.__init__.<locals>.<genexpr>  s      -
/*{ '"..@@qa282D2D2V2VqQ.\]&,,>>?"6#5#5#G#GH*0*<*<*N*N&N	/s   B3B6c              3      >#    U  H7  u  pn[        UUUTR                  R                  (       a  US -
  S-  OSS9v   M9     g7f)r   rh   r   r  N)rv  r  r  r  s       r1   r  r    sL      
)
/*{ '282D2D2V2VqQ.\]	/s   ?A)rL   )rN   rO   r  r  rv  rR   r   r"  
input_convupsample_out_channels
ModuleListzipupsample_kernel_sizesupsample_layersconv_out_channelsconv_kernel_sizesconv_layersrZ   
num_labels	predictor)r_   rJ   upsample_in_channelsconv_in_channelspredictor_inr`   s    `   r1   rO   Sapiens2Head.__init__  s    !!33 f00&2D2DRS]^_ 	
 !' 2 23f6H6H6^6^_b`b6cc!}} -
 /2$""88""88/-
  
" #..DDRHIFL^L^LpLpqtrtLuu== 
)
 /2 &"4"4"F"FHZHZHlHl/
)
 

 !!33 004 !!77 ##99"=## 	 <1B1BPQRr0   r7   rd   c                     U R                  U5      nU R                   H  nU" U5      nM     U R                   H  nU" U5      nM     U R                  U5      $ r{   )r  r  r  r  r_   r7   layers      r1   ry   Sapiens2Head.forward  sS    6))E!-0M *%%E!-0M &~~m,,r0   )r  r  r  r  r&   r'   r(   r)   r   rO   r,   r   ry   r/   r|   r}   s   @r1   r  r    s2    ,S~ ,S\-U\\ -ell - -r0   r  c                   b   ^  \ rS rSrS\S\S\R                  SS4U 4S jjrS\S\4S	 jr	S
r
U =r$ )Sapiens2PointmapFinalLayerBlocki  in_dimout_dimr{  rd   Nc                    > [         TU ]  5         [        R                  " [        R                  " X5      U/5      U l        g r{   )rN   rO   r   r  r  layers)r_   r  r  r{  r`   s       r1   rO   (Sapiens2PointmapFinalLayerBlock.__init__  s,    mmRYYv%?$LMr0   inputc                 @    UnU R                    H  nU" U5      nM     U$ r{   r  )r_   r  r:  r  s       r1   ry   'Sapiens2PointmapFinalLayerBlock.forward  s%    [[E .L !r0   r  )r&   r'   r(   r)   r   r   ModulerO   r   ry   r/   r|   r}   s   @r1   r  r    sE    Ns NS Nbii ND NV   r0   r  c            	          ^  \ rS rSrSS\S\\\4   S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )Sapiens2PointmapFinalLayeri  r  hidden_sizesr  r{  c                    > [         TU ]  5         [        R                  " 5       U l        [        XS   [        U   S9U l        [        US   US   [        U   S9U l        [        R                  " US   U5      U l
        g )Nr   )r  r  r{  r   )rN   rO   r   Flattenrp   r  r
   block1block2r  proj)r_   r  r  r  r{  r`   s        r1   rO   #Sapiens2PointmapFinalLayer.__init__  so    zz|5?vj?Q
 6?LOzHZ
 IIl1ow7	r0   r7   rd   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      $ r{   )rp   r  r  r  r  s     r1   ry   "Sapiens2PointmapFinalLayer.forward  s;    ]3M2M2yy''r0   )r  r  rp   r  )r   r  )r&   r'   r(   r)   r   r+   r   rO   r,   r   ry   r/   r|   r}   s   @r1   r  r    sR    	8s 	8%S/ 	8C 	8ad 	8 	8(U\\ (ell ( (r0   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Sapiens2PointmapScaleHeadi  rJ   c                   > [         TU ]  5         [        R                  " 5       U l        UR
                  /UR                  R                  S S -   n[        UUR                  R                  UR                  R                  5       H1  u  p4nU R                  R                  [        X4USUS-
  S-  S95        M3     [        UR                  R                  UR                  R                  UR                  S9U l        g )Nri   rh   r   )rL   rM   ry  )r{  )rN   rO   r   r  r  rR   r  scale_conv_out_channelsr  scale_conv_kernel_sizesappendrv  r  scale_final_input_sizescale_final_hidden_sizesrG  r  )r_   rJ   scale_in_channelsr  r  rL   r`   s         r1   rO   "Sapiens2PointmapScaleHead.__init__  s    ==?#//063E3E3]3]^a_a3bb*-6666+
&E;
 ##!%[QR]hkl]lqr\rs+
 45577((
r0   r7   rd   c                 Z    U R                    H  nU" U5      nM     U R                  U5      $ r{   r  r  r  s      r1   ry   !Sapiens2PointmapScaleHead.forward  s+    %%E!-0M &~~m,,r0   r  r  r}   s   @r1   r  r    s/    
~ 
$-U\\ -ell - -r0   r  c                      ^  \ rS rSr% \\S'   SrSrSrSr	S/r
SrSrSrSr\\S.rS	/rS
/r\R*                  " 5       SU 4S jj5       rSrU =r$ )Sapiens2PreTrainedModeli  rJ   modelrb   )imageTre  )r7   r8   periodsrV   c           	        > [         TU ]  U5        [        U[        R                  [        R
                  45      (       a5  [        R                  " UR                  SU R                  R                  S9  g[        U[        R                  5      (       a!  [        R                  " UR                  SSS9  g[        U[        5      (       a  [        R                  " UR                  SU R                  R                  S9  UR                  R                  S:  a4  [        R                  " UR                   SU R                  R                  S9  UR                  R"                  (       a!  [        R$                  " UR&                  5        gg[        U[(        5      (       a6  [        R*                  " UR,                  U R                  R.                  5        g[        U[0        5      (       ad  SUR2                  [4        R6                  " SSSUR8                  -  [4        R:                  S	9-  -  n[        R<                  " UR>                  U5        g[        U[@        [B        45      (       a  URE                  5        H  n[        U[        R
                  5      (       a"  [        R                  " UR                  SSS9  MD  [        U[        R                  5      (       d  Me  [        R                  " UR                  S
SS9  M     gg)zInitialize the weightsr)  )r   stdfan_outrelu)modenonlinearityr   r   r   rf   fan_inlinearN)#rN   _init_weightsr   r   r  rZ   inittrunc_normal_rn   rJ   initializer_ranger  kaiming_normal_rH   rS   rX   rY   rT   zeros_rV   r5  	constant_r8  r7  r   r   r,   r   r   r   copy_r   r  r  modules)r_   r   r   head_moduler`   s       r1   r  %Sapiens2PreTrainedModel._init_weights,  s    	f%fryy"))455v}}3DKK<Y<YZ 2 233  YVT 233v//ct{{?\?\]}}0014""6#9#9IfIfg}}++F--. , 233NN6>>4;;+G+GH =>>6;;%,,q!Q=PX]XeXe*fffHJJv1/H IJJ%~~/k29955((););)Z`aRYY77((););(Yab	  0 Kr0   r%   r>  )r&   r'   r(   r)   r   r.   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendre  r  _can_record_outputs"_keys_to_ignore_on_load_unexpected_keys_to_ignore_on_load_missingr,   no_gradr  r/   r|   r}   s   @r1   r  r    sz    $O!&*#()N"&&' +5&'4o#
]]_c cr0   r  c                      ^  \ rS rSrS\4U 4S jjr\\" SS9 SS\R                  S\
\R                  \R                  4   S-  S	\\   S
\4S jj5       5       rSrU =r$ )Sapiens2EncoderiG  rJ   c           
         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        XS9PM     sn5      U l        U R                  5         g s  snf )Nrg  )	rN   rO   r   r  rangenum_hidden_layersre  r  	post_initr%  s      r1   rO   Sapiens2Encoder.__init__H  sV     ]]INvOgOgIhiIhI]67Ihi

 	 js   A(F)tie_last_hidden_statesNr7   r'  r   rd   c                 P    U R                    H  nU" U4SU0UD6nM     [        US9$ )Nr'  )last_hidden_state)r  r   )r_   r7   r'  r   layer_modules        r1   ry   Sapiens2Encoder.forwardP  s5     !JJL(jL_jcijM ' ??r0   )r  r{   )r&   r'   r(   r)   r   rO   r   r   r,   r   r+   r   r   r   ry   r/   r|   r}   s   @r1   r  r  G  s    ~   E2 IM	@||	@ #5<<#=>E	@ +,		@
 
	@ 3  	@r0   r  c                      ^  \ rS rSrS\4U 4S jjrS r\\ SS\	R                  S\	R                  S-  S\\   S	\4S
 jj5       5       rSrU =r$ )Sapiens2Modeli^  rJ   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        [        UR                  UR                  S9U l        SU l        U R                  5         g )Nr  F)rN   rO   rH   rx   r   rope_embeddingsr  r  r   rR   r!  r  gradient_checkpointingr  r^   s     r1   rO   Sapiens2Model.__init__`  sa     ,V4<VD$V,
#F$6$6F<O<OP	&+#r0   c                 .    U R                   R                  $ r{   rx   r]   r   s    r1   get_input_embeddings"Sapiens2Model.get_input_embeddingsj      ///r0   Nrb   rc   r   rd   c                 h   UR                  U R                  R                  R                  R                  5      nU R                  XS9nU R                  U5      nU R                  " XE40 UD6nU R                  UR                  5      nUSS2SSS24   n[        UUUR                  UR                  S9$ )a@  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
    pre-training.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-pretrain-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-pretrain-0.4b")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> cls_token = outputs.pooler_output
>>> cls_token.shape
torch.Size([1, 1024])
```
)rc   Nr   )r  pooler_outputr7   r8   )ro   rx   r]   rn   rg   r  r  r  r  r   r7   r8   )	r_   rb   rc   r   r7   r'  outputsequence_outputpooled_outputs	            r1   ry   Sapiens2Model.forwardm  s    D $t'G'G'N'N'T'TUV"22<@MI&I))F$<$<='1a0)-' ..((	
 	
r0   )rx   r  r  r  r  r{   )r&   r'   r(   r)   r   rO   r
  r   r   r,   r   r   r   r   ry   r/   r|   r}   s   @r1   r  r  ^  sn    ~ 0  04-
ll-
 ,-
 +,	-

 
$-
  -
r0   r  c            	          ^  \ rS rSrS\4U 4S jjrS r\\\	S\
R                  S\\   S\4S j5       5       5       rS	rU =r$ )
Sapiens2Backbonei  rJ   c                 p  > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        [        UR                  UR                  S9U l        SU l        [        UR                  S-   5       Vs/ s H  o!R                  PM     snU l        U R!                  5         g s  snf )Nr  Fr   )rN   rO   rH   rx   r   r  r  r  r   rR   r!  r  r  r  r  num_featuresr  )r_   rJ   r   r`   s      r1   rO   Sapiens2Backbone.__init__  s     ,V4<VD$V,
#F$6$6F<O<OP	&+#9>v?W?WZ[?[9\]9\A//9\] ^s   B3c                 .    U R                   R                  $ r{   r	  r   s    r1   r
  %Sapiens2Backbone.get_input_embeddings  r  r0   rb   r   rd   c                    UR                  U R                  R                  R                  R                  5      nU R                  U5      nU R                  U5      nSUS'   U R                  " X440 UD6nUR                  nUR                  u  pxpU R                  R                  n[        U[        5      (       a  UOUS   n[        U[        5      (       a  UOUS   nX-  nX-  nS[        U R                  SS5      -   n[        U R                  SS5      n/ / nn[        [        U R                   U5      5       H  u  nu  nnU R                  R"                  (       a  U R%                  U5      nUU R&                  ;   d  MG  U(       a  UR)                  USS2SSS24   5        USS2US2SS24   nU R                  R*                  (       aA  UR-                  X~UUR                  S	   5      R/                  SS
SS5      R1                  5       nOUnUR)                  U5        M     [3        [5        U5      U(       a  [5        U5      OSUR                  UR6                  S9$ )a  
Example:

```python
>>> from transformers import AutoBackbone, AutoImageProcessor
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-pretrain-0.4b")
>>> model = AutoBackbone.from_pretrained("facebook/sapiens2-pretrain-0.4b")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs, return_class_token=True)

>>> outputs.feature_maps[0].shape
torch.Size([1, 1024, 64, 48])
>>> outputs.cls_tokens[0].shape
torch.Size([1, 1024])
```
Toutput_hidden_statesr   r   rX   return_class_tokenFNri   r   rh   )feature_mapsr$   r7   r8   )ro   rx   r]   rn   rg   r  r  r7   rm   rJ   r\   r   r   getattr	enumerater  stage_namesnormalize_backbone_outputsr  out_featuresr  reshape_hidden_statesr  permuter   r"   r+   r8   )r_   rb   r   r7   r'  r  stage_hidden_statesrv   r   image_heightimage_widthr\   r   r   num_patches_heightnum_patches_width
num_prefixr  r  r$   idx
stage_namer:  patch_tokensfeature_maps                            r1   ry   Sapiens2Backbone.forward  s   < $t'G'G'N'N'T'TU5"22<@)-%&MI&I$223?3E3E0
|[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm)9'7.CQGG
$T[[2FN#%rj/8T=M=MOb9c/d+C+*l{{55#yy6T...%%%l1a7&;<+Az{A,=>;;44$,,ZM^`l`r`rsu`vw Aq!,#   #/K##K0# 0e& &|,,>uZ(D ..((	
 	
r0   )rx   r  r  r  r  r  )r&   r'   r(   r)   r   rO   r
  r   r   r   r,   r   r   r   r"   ry   r/   r|   r}   s   @r1   r  r    sc    
~ 
0  F
llF
 +,F
 
 	F
  ! F
r0   r  zfacebook/sapiens2-seg-0.4b)
checkpointc                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\\   S\4S	 jj5       5       rS
rU =r$ )Sapiens2ForSemanticSegmentationi  rJ   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r{   rN   rO   r  r  r  r  decode_headr  r^   s     r1   rO   (Sapiens2ForSemanticSegmentation.__init__  @      ++"6*
'/r0   Nrb   labelsr   rd   c                    Ub%  U R                   R                  S:X  a  [        S5      eU R                  " U40 UD6nUR                  u  pVpxU R                   R
                  n	[        U	[        5      (       a  U	OU	S   n
[        U	[        5      (       a  U	OU	S   nXz-  nX-  nUR                  SS2SU R                   R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      nSnUb$  U R                  UX R                   R                  S9n[        UUUR                   UR"                  S9$ )	a`  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss.
    Indices should be in `[0, ..., config.num_labels - 1]`.
    If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-seg-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-seg-0.4b")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.logits.shape
torch.Size([1, 29, 1024, 768])
```
Nr   z/The number of labels should be greater than oner   rh   ri   )ignore_index)r5   logitsr7   r8   )rJ   r  rl   r  rm   r\   r   r   r  rX   rq   r  r5  loss_functionsemantic_loss_ignore_indexr   r7   r8   )r_   rb   r8  r   outputsrv   r   r   r   r\   r   r   patch_heightpatch_widthr-  r.  r;  r5   s                     r1   ry   'Sapiens2ForSemanticSegmentation.forward  sA   B $++"8"8A"=NOO**\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e!!+.%%ff;;CiCi%jD&!//))	
 	
r0   r5  r  r  r{   )r&   r'   r(   r)   r   rO   r   r   r,   r-   
LongTensorr   r   r   ry   r/   r|   r}   s   @r1   r2  r2    sm    ~   +/9
''9
   4'9
 +,	9

 
!9
  9
r0   r2  c                    US;  a  [        S5      eU R                  S:w  a  [        S5      eU R                  u  p4pVSnUS:X  a+  SnU R                  5       n U SS2SSS2S	4   * U SS2SSS2S	4'   U R	                  US
XuU5      n U R                  5       nUR                  S
5      u  pU SS2U
S	4   USS2U	S	4'   U SS2U	S	4   USS2U
S	4'   UR	                  X4XV45      nUR                  S
5      nU$ )a  Flip the flipped heatmaps back to the original form.

Args:
    output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
        The output heatmaps obtained from the flipped images.
    flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
        Pairs of keypoints which are mirrored (for example, left ear -- right ear).
    target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
        Target type to use. Can be gaussian-heatmap or combined-target.
        gaussian-heatmap: Classification target with gaussian distribution.
        combined-target: The combination of classification target (response map) and regression target (offset map).
        Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

Returns:
    torch.Tensor: heatmaps that flipped back to the original image
)gaussian-heatmapcombined-targetz9target_type should be gaussian-heatmap or combined-targetr   zCoutput_flipped should be [batch_size, num_keypoints, height, width]r   rF  r   N.ri   )rl   rZ  rm   cloner  unbindflip)output_flipped
flip_pairstarget_typerv   num_keypointsr   r   channelsoutput_flipped_backleft_indicesright_indicess              r1   	flip_backrR  C  s4   " AATUUa^__/=/C/C,JvH'''--/(6q!$Q$|(D'Dq!$Q$|$#++JHeTN(..0 #-"3"3B"7L0>q-QT?T0U<,-1?<QT@T1U=#-.-55zRX6`a-2226r0   zfacebook/sapiens2-pose-0.4bz
    The Sapiens2 model with a pose estimation head on top (a set of heatmap predictors on top of the hidden states output).
    )r0  r    c                      ^  \ rS rSrS\4U 4S jjr\\  SS\R                  S\R                  S-  S\R                  S-  S\\   S	\4
S
 jj5       5       rSrU =r$ )Sapiens2ForPoseEstimationim  rJ   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r{   r4  r^   s     r1   rO   "Sapiens2ForPoseEstimation.__init__t  r7  r0   Nrb   rK  r8  r   rd   c                 0   U R                   " U40 UD6nUR                  u  pgpU R                  R                  n
[	        U
[
        5      (       a  U
OU
S   n[	        U
[
        5      (       a  U
OU
S   nX-  nX-  nUR                  SS2SU R                  R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      nUb  [        UU5      nSnUb  [        S5      e[        UUUR                  UR                  S9$ )av  
flip_pairs (`torch.Tensor` of shape `(num_pairs, 2)`, *optional*):
    Pairs of keypoints which are mirrored (for example, left ear -- right ear), used for
    test-time flip augmentation. When provided, the model assumes `pixel_values` contains
    horizontally-flipped images and calls `flip_back` on the output heatmaps to restore the
    original orientation.
labels (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`, *optional*):
    Heatmap ground truth for computing the loss.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-pose-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-pose-0.4b")

>>> boxes = [[[270.8, 0.6, 294.1, 379.5]]]
>>> inputs = image_processor(image, boxes=boxes, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.heatmaps.shape
torch.Size([1, 308, 256, 192])
```
r   r   Nrh   ri   Training is not yet supported)r5   r6   r7   r8   )r  rm   rJ   r\   r   r   r  rX   rq   r  r5  rR  NotImplementedErrorr3   r7   r8   )r_   rb   rK  r8  r   r>  rv   r   r   r   r\   r   r   r?  r@  r-  r.  r6   r5   s                      r1   ry   !Sapiens2ForPoseEstimation.forward{  s   L **\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e##K0! :6H%&EFF*!//))	
 	
r0   rB  r2  )r&   r'   r(   r)   r   rO   r   r   r,   r-   r   r   r   r3   ry   r/   r|   r}   s   @r1   rT  rT  m  s    ~   +/+/	=
''=
 LL4'=
 !!D(	=

 +,=
 
%=
  =
r0   rT  zfacebook/sapiens2-normal-0.4bz
    The Sapiens2 model with a normal estimation head on top (a PixelShuffle-based decoder that predicts surface normal maps).
    c                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\
\   S\4S	 jj5       5       rS
rU =r$ )Sapiens2ForNormalEstimationi  rJ   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r{   r4  r^   s     r1   rO   $Sapiens2ForNormalEstimation.__init__  r7  r0   Nrb   r8  r   rd   c                    U R                   " U40 UD6nUR                  u  pVpxU R                  R                  n	[	        U	[
        5      (       a  U	OU	S   n
[	        U	[
        5      (       a  U	OU	S   nXz-  nX-  nUR                  SS2SU R                  R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      nSnUb  [        S5      e[        UUUR                  UR                  S9$ )a  
labels (`torch.FloatTensor` of shape `(batch_size, num_labels, height, width)`, *optional*):
    Ground-truth surface normal maps for computing the loss.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-normal-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-normal-0.4b")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.normals.shape
torch.Size([1, 3, 1024, 768])
```
r   r   Nrh   ri   rX  )r5   r<   r7   r8   )r  rm   rJ   r\   r   r   r  rX   rq   r  r5  rY  r:   r7   r8   )r_   rb   r8  r   r>  rv   r   r   r   r\   r   r   r?  r@  r-  r.  r<   r5   s                     r1   ry   #Sapiens2ForNormalEstimation.forward  s
   > **\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e"";/%&EFF,!//))	
 	
r0   rB  r{   )r&   r'   r(   r)   r   rO   r   r   r,   r-   r   r   r:   ry   r/   r|   r}   s   @r1   r\  r\    sm    ~   ,04
''4
 !!D(4
 +,	4

 
'4
  4
r0   r\  zfacebook/sapiens2-pointmap-0.4bz
    The Sapiens2 model with a pointmap head on top (a PixelShuffle-based decoder that predicts per-pixel 3D XYZ
    coordinates, plus an optional scale branch for focal-length normalization).
    c                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\
\   S\4S	 jj5       5       rS
rU =r$ )Sapiens2ForPointmapEstimationi  rJ   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        UR                  b"  UR                  R                  b  [        U5      O[        R                  " 5       U l        U R                  5         g r{   )rN   rO   r  r  r  r5  r  r  r  r   r"  
scale_headr  r^   s     r1   rO   &Sapiens2ForPointmapEstimation.__init__  sn     "6*
'/ !!-&2D2D2\2\2h &f- 	
 	r0   Nrb   r8  r   rd   c                    U R                   " U40 UD6nUR                  u  pVpxU R                  R                  n	[	        U	[
        5      (       a  U	OU	S   n
[	        U	[
        5      (       a  U	OU	S   nXz-  nX-  nUR                  SS2SU R                  R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      n[	        U R                  [        R                  5      (       a  SOU R                  U5      nSnUb  [        S5      e[        UUUUR                   UR"                  S9$ )a  
labels (`torch.FloatTensor` of shape `(batch_size, 3, height, width)`, *optional*):
    Ground-truth pointmap for computing the loss.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-pointmap-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-pointmap-0.4b")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.pointmaps.shape
torch.Size([1, 3, 1024, 768])
```
r   r   Nrh   ri   rX  )r5   r@   rA   r7   r8   )r  rm   rJ   r\   r   r   r  rX   rq   r  r5  rd  r   r"  rY  r>   r7   r8   )r_   rb   r8  r   r>  rv   r   r   r   r\   r   r   r?  r@  r-  r.  r@   rA   r5   s                      r1   ry   %Sapiens2ForPointmapEstimation.forward  s2   > **\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e$$[1	#DOOR[[AAtWbGc%&EFF.!//))
 	
r0   )r5  r  rd  r{   )r&   r'   r(   r)   r   rO   r   r   r,   r-   r   r   r>   ry   r/   r|   r}   s   @r1   rb  rb    sm    	~ 	  ,06
''6
 !!D(6
 +,	6

 
)6
  6
r0   rb  zfacebook/sapiens2-matting-1bz
    The Sapiens2 model with a matting head on top (a PixelShuffle-based decoder that predicts a
    pre-multiplied RGB foreground and an alpha matte).
    c                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\
\   S\4S	 jj5       5       rS
rU =r$ )Sapiens2ForImageMattingiR  rJ   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r{   )rN   rO   r  r  r  r5  r  r^   s     r1   rO    Sapiens2ForImageMatting.__init__Z  s3     "6*
'/r0   Nrb   r8  r   rd   c                 \   U R                   " U40 UD6nUR                  u  pVpxU R                  R                  n	[	        U	[
        5      (       a  U	OU	S   n
[	        U	[
        5      (       a  U	OU	S   nXz-  nX-  nUR                  SS2SU R                  R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      R                  5       nUSS2SS24   nUSS2SS24   nSnUb  [        S5      e[        UUUUR                  UR                  S9$ )	a	  
labels (`torch.FloatTensor` of shape `(batch_size, 4, height, width)`, *optional*):
    Ground-truth matting targets for computing the loss.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-matting-1b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-matting-1b")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.alphas.shape
torch.Size([1, 1, 1024, 768])
>>> outputs.foregrounds.shape
torch.Size([1, 3, 1024, 768])
```
r   r   Nrh   ri   r   rX  )r5   rE   rF   r7   r8   )r  rm   rJ   r\   r   r   r  rX   rq   r  r5  sigmoidrY  rC   r7   r8   )r_   rb   r8  r   r>  rv   r   r   r   r\   r   r   r?  r@  r-  r.  mattingrF   rE   r5   s                       r1   ry   Sapiens2ForImageMatting.forward`  s7   B **\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e"";/779a!enAB%&EFF)#!//))
 	
r0   )r5  r  r{   )r&   r'   r(   r)   r   rO   r   r   r,   r-   r   r   rC   ry   r/   r|   r}   s   @r1   ri  ri  R  sm    ~   ,09
''9
 !!D(9
 +,	9

 
$9
  9
r0   ri  )r2  rT  r\  rb  ri  r  r  r  )NNN)r)  NN)rE  )Yr   collections.abcr   r   dataclassesr   numpyr   r,   r   r    r	   r  activationsr
   backbone_utilsr   r   integrationsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   utils.genericr   r   r   utils.output_capturingr   configuration_sapiens2r   r"   r3   r:   r>   rC   r  rH   r   rg   r   r   r   r   r   r   r   r+   r   r  r   r  r5  r@  rN  rU  re  rv  r  r  r  r  r  r  r  r  r2  rR  rT  r\  rb  ri  __all__r%   r0   r1   <module>r     s=  &  . !    & ! H 7 9  G & @ 7 Y Y 5 2  7^ 7 7 
 <+ < <$ 
 <K < <* 
 <k < <0 
 1 1 1," "J %R0'*38;;HM
\\ 1<  	LL4< DL T\	
 \\:48BII 48n Y'Jbii J (J((   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%D||+0<<>Cll
5<<%&B	UU\\ 	U# 	U%,, 	U?)		 ?)D+ +<")) <ryy  %ryy %0+. +\B		 BJ5-299 5-p	bii 	( (&-		 -2 -co -c -c`@- @. =
+ =
 =
@ Y
}&= Y
 Y
x 78C
&= C
 9C
L'T ,G
 7 G
G
T .>
"9 >
>
B 0D
$; D
D
N -B
5 B
B
J	r0   