
    3j2[                       S SK JrJr  S SKJr  S SKJr  S SKrS SKJ	s  J
r  S SKJr  S SKJ	r	  S SKJ
r  S SKJr  S S	KJr  S
SKJr  S
SKJr  S
SKJr  S
SKJr  S
SKJrJr  S
SK J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(  S
SK)J*r*J+r+J,r,  S
SK-J.r.J/r/  S
SK0J1r1  S
SK2J3r3J4r4J5r5J6r6  S
SK7J8r8  SSK9J:r:J;r;  SSK<J=r=  SSK>J?r?J@r@JArAJBrBJCrCJDrDJErEJFrFJGrGJHrH  SSKIJJrJ  SSKKJLrL  SSKMJNrN  SSKOJPrP  SSKQJRrRJSrS  SSKTJUrU  SS KVJWrWJXrX  \6R                  " \Z5      r[\5" S!S"9\ " S# S$\5      5       5       r\\5" S%S"9\ " S& S'\W5      5       5       r]\5" S(S"9\ " S) S*\+5      5       5       r^\5" S+S"9\ " S, S-\+5      5       5       r_\5" S.S"9\ " S/ S0\U5      5       5       r` S~S1\R                  S2\b\c\c4   S3\dS4\b\R                  \R                  4   4S5 jjre S~S6\R                  S1\R                  S2\b\c\c4   S3\dS4\R                  4
S7 jjrfSS8\R                  S9\cS4\R                  4S: jjrgS8\R                  S4\b\R                  \R                  4   4S; jrh SS<\R                  S8\R                  S=\cS4\R                  4S> jjri " S? S@\;SASB9rj " SC SD\:5      rk\5" SESF9\ " SG SH\5      5       5       rl\5" SISF9\ " SJ SK\=5      5       5       rm " SL SM\A5      rn " SN SO\G5      ro " SP SQ\L5      rp " SR SS\?5      rq " ST SU\D5      rr " SV SW\C5      rs " SX SY\P5      rt " SZ S[\	R                  5      rv " S\ S]\N5      rw " S^ S_\	R                  5      rx " S` Sa\	R                  5      ry " Sb Sc\F5      rz " Sd Se\B5      r{ " Sf Sg\E5      r| " Sh Si\@5      r}\5" SESF9 " Sj Sk\z5      5       r~\5" SlSmSn9 " So Sp\z5      5       r\5" SqSrSn9 " Ss St\z5      5       r\5" SuSvSn9 " Sw Sx\z5      5       r\5" SySzSn9 " S{ S|\z5      5       r/ S}Qrg)    )CallableIterable)	dataclass)UnionN)strict)nn)
functional)TorchvisionBackend)DINOv3ViTBackboneOutput   )initialization)ACT2FN)PreTrainedConfig)BatchFeature)group_images_by_shapereorder_images)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDChannelDimension
ImageInputPILImageResamplingSizeDict#get_image_size_for_max_height_widthmake_list_of_images)BaseModelOutputWithPoolingModelOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
TensorTypeTransformersKwargsauto_docstringlogging)can_return_tuple   )BeitImageProcessorBeitImageProcessorKwargs)DINOv3ViTConfig)
DINOv3ViTAttentionDINOv3ViTBackboneDINOv3ViTEmbeddingsDINOv3ViTEncoderDINOv3ViTLayerDINOv3ViTLayerScaleDINOv3ViTModelDINOv3ViTPreTrainedModelDINOv3ViTRopePositionEmbeddingapply_rotary_pos_emb)eager_attention_forward)LlamaRMSNorm)Mask2FormerPredictionBlock)"PPOCRV5ServerDetConvBatchnormLayer)box_xywh_to_cxcywhbox_xywh_to_xyxy)ImageMattingOutput)VitPoseEstimatorOutput	flip_backz
    Output type of [`Sapiens2Backbone`], extending [`BackboneOutput`] with optional CLS tokens from
    each selected feature stage (used when `config.return_class_token=True`).
    )custom_introc                       \ rS rSrSrg)Sapiens2BackboneOutputH    N__name__
__module____qualname____firstlineno____static_attributes__rA       g/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/sapiens2/modular_sapiens2.pyr?   r?   H   s     	rH   r?   z6
    Class for outputs of pose estimation models.
    c                       \ rS rSrSrSrg)Sapiens2PoseEstimatorOutputS   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Pose estimation loss.
heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
    Heatmaps as predicted by the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
    (also called feature maps) of the model at the output of each stage.
rA   N)rC   rD   rE   rF   __doc__rG   rA   rH   rI   rK   rK   S   s    	rH   rK   z8
    Class for outputs of normal estimation models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
Sapiens2NormalEstimatorOutputf   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Normal estimation loss.
normals (`torch.FloatTensor` of shape `(batch_size, num_labels, height, width)`):
    Raw normal map predictions as output by the model (unnormalized).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage)
    of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
    each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one per layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`. Attentions weights after the attention softmax.
Nlossnormals.hidden_states
attentionsrA   )rC   rD   rE   rF   rM   rQ   torchFloatTensor__annotations__rR   rS   tuplerT   rG   rA   rH   rI   rO   rO   f   sq     &*D%

d
")(,GU%,:>M5**C/047>7;Je'',-4;rH   rO   z:
    Class for outputs of pointmap estimation models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)Sapiens2PointmapEstimatorOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Pointmap estimation loss.
pointmaps (`torch.FloatTensor` of shape `(batch_size, 3, height, width)`):
    Per-pixel 3D XYZ coordinate predictions in canonical camera space.
scales (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
    Canonical focal length / actual focal length ratio. `None` when no scale branch is configured.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage)
    of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of
    each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one per layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`. Attentions weights after the attention softmax.
NrQ   	pointmapsscales.rS   rT   rA   )rC   rD   rE   rF   rM   rQ   rU   rV   rW   r\   r]   rS   rX   rT   rG   rA   rH   rI   rZ   rZ      s      &*D%

d
")*.Iu  4'.'+FE$+:>M5**C/047>7;Je'',-4;rH   rZ   z4
    Class for outputs of image matting models.
    c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Sapiens2ImageMattingOutput   a"  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Loss.
alphas (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`):
    Estimated alpha values.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
    (also called feature maps) of the model at the output of each stage.
foregrounds (`torch.FloatTensor` of shape `(batch_size, 3, height, width)`):
    Pre-multiplied RGB foreground predictions in `[0, 1]` (sigmoid-activated).
NforegroundsrA   )
rC   rD   rE   rF   rM   ra   rU   rV   rW   rG   rA   rH   rI   r_   r_      s     -1K""T)0rH   r_   boxesoutput_sizepaddingreturnc           	         U R                  S5      u  p4pV[        R                  " X4/SS9nXR-  nXb-  n	Uu  pX-  n[        R                  " XU-  :  S   [        R                  " XU-  /SS9[        R                  " X-  U	/SS95      nX}4$ )a  Compute crop center and scale from bounding boxes, applying padding and aspect ratio correction.

Accepts either a single box `(4,)` or multiple boxes `(num_boxes, 4)` and returns center/scale with a matching
leading dimension.

Args:
    boxes (`torch.Tensor` of shape `(4,)` or `(num_boxes, 4)`): Bounding box in
        (center-x, center-y, width, height) format, with values in absolute pixel coordinates.
    output_size (`tuple[int, int]`): Target output size as `(height, width)`, used to compute
        the aspect ratio for scale correction.
    padding (`float`, *optional*, defaults to `1.25`): Multiplicative factor applied to the
        bounding box dimensions, adding context around the region of interest.

Returns:
    `tuple[torch.Tensor, torch.Tensor]`: A pair `(center, scale)` where `center` has shape
    `(..., 2)` with (x, y) in input-image pixel coordinates, and `scale` has shape `(..., 2)`
    with (width, height) in input-image pixels representing the dimensions of the padded,
    aspect-ratio-corrected crop window.
dim).N)unbindrU   stackwhere)rb   rc   rd   center_xcenter_ywidthheightcenterscaled_widthscaled_heightoutput_heightoutput_widthaspect_ratioscales                 rI   boxes_to_crop_paramsrx      s    0 ).R(8%H[[(-26F?L$M"-M/LKK	4	4i@\,#>?RH]1=ArJE
 =rH   imagec           	      "   Uu  pEU R                   u  pgn[        XUS9u  pU	R                  S5      u  pU
R                  S5      u  pUS-
  U-  nUS-
  U-  n[        R                  " UU5      S:  n[        R
                  " [        R                  " U[        R                  U R                  S9[        R                  " U[        R                  U R                  S9SS9u  nnUUSS2SS4   -  USS2SS4   -   S	USS2SS4   -  -
  nUUSS2SS4   -  USS2SS4   -   S	USS2SS4   -  -
  n[        R                  " S
U-  US-
  -  S-
  S
U-  US-
  -  S-
  /SS9nUR                   S   n[        R                  " UXdXPR                  U R                  S9nU R                  S5      nUS4U) S44 H[  u  nnUR                  5       (       d  M  [        R                  " UR!                  UR#                  5       SSS5      UU   USSS9UU'   M]     U$ )a  Crops and resizes bounding box regions from the input image to the target output size.

Applies padding and aspect ratio correction to each crop before resizing.
Uses bilinear interpolation for downscaling and bicubic for upscaling.

This implementation is equivalent to the cv2 affine warp with rotation=0 used in the original
Sapiens2 codebase. Rotation is always zero because we don't support rotated bounding boxes.

Args:
    image (`torch.Tensor`): Input image tensor of shape `(C, H, W)` in float32.
    boxes (`torch.Tensor`): Bounding boxes in (center-x, center-y, width, height) format,
        shape `(num_boxes, 4)`, with values in absolute pixel coordinates.
    output_size (`tuple[int, int]`): Target output size as `(height, width)`.
    padding (`float`, *optional*, defaults to `1.25`): Multiplicative factor applied to the
        bounding box dimensions before cropping, adding context around the region of interest.

Returns:
    `torch.Tensor`: Cropped and resized images of shape `(num_boxes, C, output_height, output_width)`.
)rc   rd   rg            ?dtypedeviceij)indexingN      ?g       @rh   r   r   r~   bilinearbicubiczerosT)modepadding_modealign_corners)shaperx   rj   rU   minimummeshgridarangefloat32r   rk   emptyr~   	unsqueezeanyFgrid_sampleexpandsum)ry   rb   rc   rd   rt   ru   num_channelsinput_heightinput_widthrq   rw   rm   rn   boxes_widthboxes_heightscale_xscale_yis_bilineargrid_ygrid_xin_xin_ygrids	num_boxesoutputimage_4dmaskr   s                               rI   crop_and_resizer      s(   2 #.M.3kk+L(QXYMFr*H %R 0Ka;.Gq L0G--1C7K^^]%--M\u||LNFF
 GAtTM**Xatm-DDs[YZ\`bfYfMgGggDGAtTM**Xatm-DDs\Z[]acgZgMhGhhDKKt{Q7#=sTz\\]M]?^ad?dekmnEAI[[LVbVbjojujuvF q!H#Z0K<2KL
d88::==
BB7d$"F4L M MrH   heatmapskernelc                    US-  S:X  d  US::  a  [        S5      eSUS-
  S-  S-
  -  S-   nUS-
  S-  nU R                  SS	9n[        R                  " XX3U4S
SS9n[        R
                  " XQU/X"/S9nUSS2X3* 2X3* 24   nUR                  SS	9n[        R                  " US:  U[        R                  " U5      5      n	[        R                  " US:  XI-  [        R                  " U5      5      n
XzSS2SS4   -  $ )a  Gaussian blur per-keypoint heatmap, preserving the original max value.

Matches cv2.GaussianBlur with sigma=0 which means that the sigma is automatically
computed from the kernel size.

Args:
    heatmaps: Shape `(K, height, width)`.
    kernel: Odd integer kernel size for the Gaussian blur. Must be greater than 1.

Returns:
    `torch.Tensor`: Blurred heatmaps of the same shape as the input.
r&   r   r{   z2Kernel size must be an odd integer greater than 1.g333333?r   g?)r{   r&   rh   constant        )r   value)kernel_sizesigmaN)	
ValueErroramaxr   padtvFgaussian_blurrU   rl   	ones_like)r   r   r   borderorigin_maxespaddedblurredresultresult_maxes
safe_maxesrw   s              rI   gaussian_blur_preserve_maxr     s    zQ&A+MNNFQJ#%)*S0EqjQF==V=,L UU8ff=JVYZFV4DUN[GQww67F;;6;*L\A-|U__\=Z[JKKq(,*CU__UaEbcE!T4-(((rH   c           
         U R                   u  pp4U R                  nU R                  XS5      nUR                  SS9nUR	                  SS9nX-  R                  5       n	X-  R                  5       n
[        R                  " UR                  S5      S:  [        R                  " X/SS9[        R                  " XS4SUS95      nX4$ )a+  Predict keypoint locations and confidence scores from heatmaps.

Args:
    heatmaps: Shape `(num_persons, num_keypoints, height, width)`.

Returns:
    locations: `(num_persons, num_keypoints, 2)` x/y in heatmap pixel coordinates.
    scores: `(num_persons, num_keypoints)` per-keypoint confidence.
rg   rh   r   r&   g      r   )r   r   reshaper   argmaxfloatrU   rl   r   rk   full)r   num_personsnum_keypoints_heatmap_widthr   heatmap_flatscores
flat_indexlocations_xlocations_y	locationss               rI   get_keypoint_predictionsr   9  s     4<>>0K__F##KCL2&F$$$,J-446K.557Ks"[.B7

K2DHI
 rH   	keypointsblur_kernel_sizec           	         UR                   u  p4pVUR                  n[        UR                  X4-  XV5      U5      R                  X4XV5      nUR	                  SS5      R                  5       n[        R                  " USSS9nUR                  5       n	US-   n
US-   nX-  nXL-  nU SS2SS2S4   R                  5       S	-   U SS2SS2S	4   R                  5       S	-   U-  -   nX[        R                  " XG[        R                  S
9SSS24   -  -   nX[        R                  " X7[        R                  S
9SS2S4   -  -   nUR                  S5      nSS	SUU* US	-   US	-   * S.nUR                  5        VVVs0 s H  u  u  nnnUU4XU-      _M     nnnnSUS   US   -
  -  nSUS   US   -
  -  nUS   SUS   -  -
  US   -   nUS   SUS   -  -
  US   -   nSUS   US   -
  US   -
  US   -   US   -   US   -
  US   -
  US   -   -  n[        R                  " UR                  5      R                   nUU-   nUU-   nUU-  UU-  -
  nUU-  UU-  -
  U-  nU* U-  UU-  -   U-  nU [        R"                  " UU/SS9-
  $ s  snnnf )a1  Sub-pixel refinement via Hessian on log-heatmaps (UDP Dark Pose).

Args:
    keypoints: Shape `(num_persons, num_keypoints, 2)` x/y in heatmap pixel coordinates.
    heatmaps: Shape `(num_persons, num_keypoints, height, width)`.

Returns:
    `(num_persons, num_keypoints, 2)` refined keypoint locations.
gMbP?g      I@)r{   r{   r{   r{   	replicate)r   r&   Nr   r{   r   rg   )r   r   r   r{   r   rg   r{   r   rg   r   r{   r{   rg   rg   r   r   r   r   r   r   r   r   rh   )r   r   r   r   clamplogr   r   flattenlongrU   r   r   itemsfinfor~   epscat)r   r   r   r   r   heatmap_heightr   r   heatmaps_paddedheatmaps_flattenedpadded_heightpadded_widthkeypoint_strideperson_strideindexposition_to_index_offsetdxdyoffsetheatmap_values
gradient_x
gradient_y
hessian_xx
hessian_yy
hessian_xyr   determinantoffset_xoffset_ys                                rI   "post_dark_unbiased_data_processingr   R  sN    AI=K__F)4nTVfgk.H  ~~dD)--/HeeHlEO(002"Q&M 1$L#2O!3MaAg##%)Yq!Qw-?-D-D-F-Jl,ZZEell=W\WaWa&bcgijcj&kkkEELLSXS]S]$^_`bf_f$gggEOOBE q  1$%  NfMkMkMmMm9I"b6R$V^44Mm   t,~e/DDEJt,~e/DDEJ%N4,@(@@>RWCXXJ%N4,@(@@>RWCXXJt

	

	 
	 
		
 
	  
	  
 	!	J ++j&&
'
+
+Cc!Jc!Jz)J,CCKZ'*z*AA[PHj(:
+BBkQHuyy(H!52>>>5s   5I?c                       \ rS rSrSrg)Sapiens2ImageProcessorKwargsi  rA   NrB   rA   rH   rI   r   r         rH   r   F)totalc            "         ^  \ rS rSr\r\R                  r\	r
\rSSS.rSrS\\   4U 4S jjr\  S2S	\S
\S-  S\\\\         S-  S\\   S\4
S jj5       r S3S	\S
\S-  S\\\\         S-  S\S\S\\-  S-  S\\S4   S-  S\4U 4S jjjr   S4S	\\R<                     S\S\SSS\S\S\S\S\S\\\   -  S-  S\\\   -  S-  S\S-  S \S!\S\\\\         S-  S\\R<                     4 S" jjr      S5S#\!S\\\\         S$\!S-  S%\"S&\S-  S'\\\#\"\"4      -  S-  S(\\\#\"\"4      -  S-  S\\\$\\R<                  4         4S) jjr%   S6S#\&S'\\\#\"\"4      -  S-  S(\\\#\"\"4      -  S-  S*\S-  S\\$\\R<                  4      4
S+ jjr'   S6S#\(S'\\\#\"\"4      -  S-  S(\\\#\"\"4      -  S-  S*\S-  S\\$\\R<                  4      4
S, jjr)  S2S#\*S(\\\#\"\"4      -  S-  S-\S-  S\\$\\R<                  4      4S. jjr+S/\R<                  S'\\\#\"\"4      -  S-  S(\\\#\"\"4      -  S-  S*\S-  S\\R<                     4
S0 jr,S1r-U =r.$ )7Sapiens2ImageProcessori     i   )rp   ro   Fkwargsc                 &   > [         TU ]  " S0 UD6  g NrA   )super__init__selfr   	__class__s     rI   r  Sapiens2ImageProcessor.__init__  s    "6"rH   Nimagessegmentation_mapsrb   re   c                 2    [         R                  " XU40 UD6$ )a  
segmentation_maps (`ImageInput`, *optional*):
    The segmentation maps to preprocess.
boxes (`list[list[list[float]]]` or `np.ndarray`, *optional*):
    List or array of bounding boxes for each image. Each box should be a list of 4 floats
    representing the bounding box coordinates in COCO format
    (top_left_x, top_left_y, width, height). When provided, each person crop is
    affine-warped to the model input size instead of resizing the full image.
)r
   
preprocess)r  r  r	  rb   r   s        rI   r  !Sapiens2ImageProcessor.preprocess  s    " ",,VXQWXXrH   do_convert_rgbinput_data_formatreturn_tensorsr   ztorch.devicec           
      <   > X8S'   [         T	U ]  " U 4UUUUUUS.UD6$ )z"Handle extra inputs beyond images.rb   )r  r	  r  r  r  r   )r  _preprocess_image_like_inputs)
r  r  r	  rb   r  r  r  r   r   r  s
            rI   r  4Sapiens2ImageProcessor._preprocess_image_like_inputs  sB      ww4	
/)/)	
 	
 		
rH   	do_resizesizeresamplez7PILImageResampling | tvF.InterpolationMode | int | Nonedo_center_crop	crop_size
do_rescalerescale_factordo_normalize
image_mean	image_stddisable_groupingdo_reduce_labelsdo_padc           	      X   Ub  US   US   4n/ n[        X5       H{  u  nn[        R                  " U[        R                  SS9n[        [        R                  " U[        R                  UR                  S95      nUR                  [        UUUS95        M}     UnSnU(       a  U R                  U5      n[        XS9u  nn0 nUR                  5        Ha  u  nnU(       aO  U(       a6  [        US   US   S9nU R                  UUU5      nU R                  UU5      nOU R                  UX45      nUUU'   Mc     [!        UU5      n[        UUS9u  nn0 nUR                  5        H8  u  nnU(       a  U R                  UU5      nU R#                  UXxXU5      nUUU'   M:     [!        UU5      $ )	Nrp   ro   Fr~   rw   r}   )rb   rc   )r  )
max_height	max_width)zipr   to_dtype_imagerU   r   r8   tensorr   extendr   reduce_labelr   r   r   resizecenter_cropr   rescale_and_normalize)r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rb   r   rc   cropsry   image_boxesboxes_tensorgrouped_imagesgrouped_images_indexresized_images_groupedr   stacked_imagesaspect_ratio_sizeresized_imagesprocessed_images_groupeds                                 rI   _preprocess"Sapiens2ImageProcessor._preprocess  s   & >4=9KE&)&&8"{**5US1%,,{RWR_R_hmhtht2uv_U,T_`a '9 FI&&v.F/DV/o,,!#%3%9%9%;!E>(0DNVZ[bVc(d%%)[[ART\%]N%)%5%5nd%KN%)[[%PN,:"5) &< ((>@TU/D^fv/w,,#% %3%9%9%;!E>!%!1!1.)!L!77
LV_N /=$U+ &< 68LMMrH   outputsoutputs_flippedr   	thresholdsource_sizestarget_sizesc                     [        U[        R                  5      (       a  UR                  5       n[        U[        R                  5      (       a  UR                  5       n[	        U5      nUb  Uc  [        S5      eUb  U[	        U5      :w  a  [        S5      eUb  U[	        U5      :w  a  [        S5      eUR                  n	Ub  XR                  -   S-  n	U	R                  n
U	R                  u  ppUS:X  a  U Vs/ s H  n/ PM     sn$ [        R                  " U VVs/ s H  nU  H  nUPM     M     snn[        R                  U
S9nU	R                  5       n	[        U	5      u  nn[        UXS9n[        [        U5      U R                   S	   U R                   S
   4S9u  nn[        R                  " US-
  US-
  /[        R                  U
S9nUU-  USS2SSS24   -  USS2SSS24   -   SUSS2SSS24   -  -
  n[#        U5      nUb  Ub  [        R                  " [%        Xg5       VVVVs/ s H  u  u  nnu  nnUU-  UU-  /PM     snnnn[        R                  U
S9n[        R&                  " [)        U5       Vs/ s H3  nUU   R+                  S5      R-                  [	        UU   5      S5      PM5     sn5      nUUSS2SSS24   -  nUUSS2/ SQ4   -  n/ n [)        U5       HQ  n!UU!   n"UU!   n#[        R.                  " XS9n$Ub  U#U:  n%U"U%   n"U#U%   n#U$U%   n$U R1                  U"U#U$UU!   S.5        MS     / n&Sn'U H*  n[	        U5      n(U&R1                  U U'U'U(-    5        U'U(-  n'M,     U&$ s  snf s  snnf s  snnnnf s  snf )a	  
Converts the output of [`Sapiens2ForPoseEstimation`] into keypoint predictions in image space.

Args:
    outputs (`Sapiens2PoseEstimatorOutput`):
        Raw outputs of the model. `outputs.heatmaps` must have shape
        `(N_total, num_keypoints, heatmap_height, heatmap_width)` where
        `N_total = sum(len(b) for b in boxes)`.
    boxes (`list[list[list[float]]]` or `np.ndarray`):
        List or array of bounding boxes for each image in absolute pixel coordinates. Each box
        should be a list of 4 floats representing the bounding box coordinates in COCO format
        (top_left_x, top_left_y, width, height). Must match the `boxes` argument passed to
        `preprocess`.
    outputs_flipped (`Sapiens2PoseEstimatorOutput`, *optional*):
        Outputs from running the model on horizontally flipped inputs. When provided, heatmaps
        are averaged with `outputs` before keypoint extraction to improve accuracy:
        `avg_heatmaps = (outputs.heatmaps + outputs_flipped.heatmaps) / 2`.
    kernel_size (`int`, *optional*, defaults to 11):
        Kernel size for the Gaussian blur used in UDP Dark Pose refinement.
    threshold (`float`, *optional*):
        Score threshold. Keypoints with scores at or below this value are
        filtered out from the result dictionaries.
    source_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Original `(height, width)` of each image in pixels. Required when `target_sizes` is
        provided, as the source coordinate space for scaling keypoints and bounding boxes.
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Desired output `(height, width)` coordinate space for each image. When provided
        alongside `source_sizes`, keypoint coordinates and bounding boxes are scaled from
        source to target space.

Returns:
    `list[list[dict]]`: Outer list is over images, inner list is over persons.
    Each dict contains:
    - `keypoints` (`torch.FloatTensor` of shape `(num_keypoints, 2)`): absolut x/y coordinates in
      the source image space, or in target space if `target_sizes` is provided.
    - `scores` (`torch.FloatTensor` of shape `(num_keypoints,)`): per-keypoint confidence.
    - `labels` (`torch.LongTensor` of shape `(num_keypoints,)`): keypoint indices.
    - `bbox` (`torch.FloatTensor` of shape `(4,)`): bounding box in absolute (x_min, y_min, x_max, y_max)
       format, in the same coordinate space as `keypoints`.
NzA`source_sizes` must be provided when `target_sizes` is specified.zHMake sure that you pass in as many source sizes as the number of images.zHMake sure that you pass in as many target sizes as the number of images.r&   r   r}   )r   r   r   rp   ro   )rc   r{   r   )r   r{   r   r{   r   )r   r   labelsbbox)
isinstancerU   Tensortolistlenr   r   r   r   r&  r   r   r   r   rx   r8   r  r9   r$  r   ranger   r   r   append))r  r8  rb   r9  r   r:  r;  r<  
num_imagesr   r   num_total_personsr   r   r   r   r-  boxr.  all_keypoints
all_scorescentersr]   heatmap_size	all_boxessource_heightsource_widthtarget_heighttarget_widthper_image_scaleimage_indexper_person_scaleperson_resultsperson_indexr   r   r>  keepr   person_offsetnum_persons_in_images)                                            rI   post_process_pose_estimation3Sapiens2ImageProcessor.post_process_pose_estimation  s   d lELL11'..0LlELL11'..0LZ
#(<`aa#
c,6G(Gghh#
c,6G(Gghh##& #;#;;q@HJR..G.! %&1B&& ||$)AE[[cS[SEA_e
 >># %=X$F!z:#h

 /|,499X;NPTPYPYZaPb:c
 ||]Q%68J$KSXS`S`iopL(6!T1*+==4QR
@SSVY\bcdfjlmcm\nVnn 	 %\2	#(@#ll Y\\hXwXwT57T| "L0--2OPXw mmO  %yy (-Z'8'8 $K0::1=DDS{I[E\^_`'8  *,<QaZ,HHM!$4Q_$EEI!"34L%l3I-F\\-?F$	)%dO	!!'6VU^_kUlm 5   K#&{#3 MM.I]9]^_11M ! I ' B0s   :M8M=
0N:Ndo_remove_paddingc                     [         R                  " UR                  SSSS9nU R                  XRX4S9nU Vs/ s H  nSU0PM	     sn$ s  snf )a  
Converts the output of [`Sapiens2ForNormalEstimation`] into L2-normalized surface normal maps.

Args:
    outputs (`Sapiens2NormalEstimatorOutput`):
        Raw outputs of the model.
    source_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Original `(height, width)` of each image before preprocessing. When provided,
        the padding added during preprocessing is removed and predictions are resized back
        to the original image size (unless `target_sizes` overrides the final size).
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Requested final `(height, width)` for each prediction. When provided, used as the
        resize target instead of `source_sizes`. Resized with bilinear interpolation after
        L2 normalization.
    do_remove_padding (`bool`, *optional*):
        Whether to crop away the zero-padding added during preprocessing before resizing.
        Defaults to `True` when `source_sizes` is provided, `False` otherwise.

Returns:
    `list[dict[str, torch.Tensor]]` of length `batch_size`. Each dict has a `"normals"` key
    mapping to a tensor of shape `(3, height, width)` with L2-normalized unit vectors in
    `[-1, 1]` per channel (XYZ surface normals).
r&   r{   g:0yE>)pri   r   mapsr;  r<  r\  rR   )r   	normalizerR   _post_process_maps)r  r8  r;  r<  r\  rR   resultsr   s           rI   post_process_normal_estimation5Sapiens2ImageProcessor.post_process_normal_estimation  sU    < ++gootD)), * 
 3::'F#':::s   Ac                     UR                   nUR                  b  XQR                  SS2SS2SS4   -  nU R                  XRX4S9nU Vs/ s H  nSU0PM	     sn$ s  snf )a  
Converts the output of [`Sapiens2ForPointmapEstimation`] into pointmap tensors in image space.

Args:
    outputs (`Sapiens2PointmapEstimatorOutput`):
        Raw outputs of the model.
    source_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Original `(height, width)` of each image before preprocessing. When provided,
        the padding added during preprocessing is removed and predictions are resized back
        to the original image size (unless `target_sizes` overrides the final size).
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Requested final `(height, width)` for each prediction. Overrides `source_sizes`
        as the resize target.
    do_remove_padding (`bool`, *optional*):
        Whether to crop away the zero-padding added during preprocessing before resizing.
        Defaults to `True` when `source_sizes` is provided, `False` otherwise.

Returns:
    `list[dict[str, torch.Tensor]]` of length `batch_size`. Each dict has a `"pointmap"` key
    mapping to a tensor of shape `(3, height, width)` with per-pixel 3D XYZ coordinates in
    canonical camera space, optionally divided by `outputs.scales` to convert to metric coordinates.
Nr_  pointmap)r\   r]   rb  )r  r8  r;  r<  r\  r\   rc  r   s           rI    post_process_pointmap_estimation7Sapiens2ImageProcessor.post_process_pointmap_estimation  sl    : %%	>>%!NN1at3C$DDI))L * 
 4;;7V$7;;;s   Abackgroundsc           	        ^^ [        T[        R                  5      (       a  TR                  5       mUR                  R
                  S   nUR                  R                  nUR                  R                  nTb  U[        T5      :w  a  [        S5      eTSL =(       d    [        U4S jT 5       5      n/ mUb~  [        U5      n[        U5      S:w  a  [        U5      U:w  a  [        S5      eU V	s/ s H<  n	[        R                  " [        R                  " U	5      USS9R                  U5      PM>     sn	mT(       + =(       d    [        U4S	 jT 5       5      n
[        R                   " UR                  UR"                  /SS
9nTb?  U(       a8  [%        TS   5      n[&        R(                  " UUSSSS9nUR+                  SS5      n/ nU(       a  U
(       a  USS2SS24   nUSS2SS24   nS/U-  nT(       a  [        R,                  " T5      nUR
                  SS UR
                  SS :w  a%  [&        R(                  " UUR
                  SS SSSS9nUSU-
  U-  -   R+                  SS5      n[        R                  " U[        R.                  SS9n[1        XU5       H  u  nnnUR3                  UUUS.5        M     U$ [5        [        U5      5       GH-  nUU   nT(       aF  U(       d?  [&        R(                  " UR7                  S5      TU   SSSS9S   nUR+                  SS5      nUSS nUSS nSnT(       a  [        T5      S:X  a  TS   OTU   nUR
                  SS UR
                  SS :w  a7  [&        R(                  " UR7                  S5      UR
                  SS SSSS9S   nUSU-
  U-  -   R+                  SS5      n[        R                  " U[        R.                  SS9nUR3                  UUUS.5        GM0     U$ s  sn	f )a`  
Converts the output of [`Sapiens2ForImageMatting`] into alpha mattes and foreground maps.

Args:
    outputs (`Sapiens2ImageMattingOutput`):
        Raw outputs of the model.
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Requested final `(height, width)` for each prediction. Resized with bilinear
        interpolation. If unset, predictions are returned at the model output resolution.
    backgrounds (`ImageInput`, *optional*):
        Background image(s) to composite over. Can be a single image (applied to every item
        in the batch) or a list of images, one per batch item. Accepts PIL images, numpy
        arrays, or torch tensors of any dtype; integer types (e.g. uint8) are scaled to
        `[0, 1]` automatically. When provided, each result dict gains a `"composite"` key
        with the composited image as a uint8 tensor in `[0, 255]`.

Returns:
    `list[dict]` of length `batch_size`. Each dict has:
    - `"alpha"` (`torch.Tensor` of shape `(1, height, width)`): alpha values in `[0, 1]`.
    - `"foreground"` (`torch.Tensor` of shape `(3, height, width)`): pre-multiplied RGB in `[0, 1]`.
    - `"composite"` (`torch.Tensor` of shape `(3, height, width)` or `None`): foreground composited
      over `backgrounds` as a uint8 tensor in `[0, 255]`; `None` when `backgrounds` is not provided.
r   Nz\Make sure that you pass in as many target sizes as the batch dimension of the matting outputc              3   X   >#    U  H  n[        U5      [        TS    5      :H  v   M!     g7fr   N)rX   ).0r  r<  s     rI   	<genexpr>DSapiens2ImageProcessor.post_process_image_matting.<locals>.<genexpr>  s&      =
>JdE$K5a11ls   '*r{   z[Make sure that you pass in as many backgrounds as the batch dimension of the matting outputTr!  c              3   f   >#    U  H&  oR                   S S TS   R                   S S :H  v   M(     g7f)Nr   )r   )rn  
backgroundbackground_tensorss     rI   ro  rp    s9      C
Xj*RS!%7%:%@%@%EEXjs   .1rh   r   Fr  r   r   	antialiasr   r|   r   rr  )
foregroundalpha	composite)r@  rU   rA  rB  ra   r   r   r~   rC  r   allr   r   r%  to_imagetor   alphasrX   r   interpolater   rk   uint8r$  rE  rD  r   )r  r8  r<  rj  
batch_sizer   r~   all_target_sizes_equalbackground_listbackground_imageall_background_sizes_equalmattingtarget_sizer   ra   r}  
compositesrs  rw  rx  ry  r   matting_itemrt  s     `                    @rI   post_process_image_matting1Sapiens2ImageProcessor.post_process_image_matting  s4   : lELL11'..0L((..q1
$$++##))#S.. r  ".!5 "
 =
>J=
 :
  "1+>O?#q(S-AZ-O q 
 )8"(7$ ""3<<0@#AVZ[^^_ef(7" *<%; &
s C
XjC
 @
" ))W00'..AqI#(>Q0Kmm #G mmC-G!&@!!RaR%.KQU^F*,J!"[[);<
##BC(GMM"#,>>!""$]]23/'&+"'"J *QZ:,EELLSRUV
 //
%++UYZ
03K0T,
E9&0!&%. 1UV A s7|,&u~(>#$==$..q1)%0'&+"'$ $L $0#5#5c3#?L)"1-
$QR( 	%:=>P:QUV:V!3A!6\not\uJ!'',0B0B230GG%&]]&003!-!3!3BC!8!+*/&+& &
 ",q5yJ.F!F M McSV WI # 2 29EKKW[ \IZ%V_`a= -@ g"s   1AP
r`  c                   ^^ [        U[        R                  5      (       a  UR                  5       n[        U[        R                  5      (       a  UR                  5       nUc  US LnU(       a  Uc  [	        S5      eUb#  [        U5      [        U5      :w  a  [	        S5      eUb#  [        U5      [        U5      :w  a  [	        S5      eU R                  S   nU R                  S   n/ mU(       ac  U H]  u  px[        Xx4XV5      u  pX:  a  XY-
  S-  OSnX:  a  Xj-
  S-  OSnTR                  UUU[        X5      -   U[        X5      -   45        M_     T(       + =(       d    [        U4S jT 5       5      n/ mUb  U Vs/ s H  n[        U5      PM     snmOUb  U Vs/ s H  n[        U5      PM     snmT(       + =(       d    [        U4S	 jT 5       5      n/ nU(       aX  U(       aQ  U(       a  TS   u  nnnnUS S 2S S 2UU2UU24   nT(       a  [        R                  " UTS   S
SSS9n[        U5      nU$ [        [        U5      5       Hl  nUU   nU(       a  TU   u  nnnnUS S 2UU2UU24   nT(       a-  [        R                  " UR!                  S5      TU   S
SSS9S   nUR                  U5        Mn     U$ s  snf s  snf )Nz>`source_sizes` must be provided when `do_remove_padding=True`.zUMake sure that you pass in as many source sizes as the batch dimension of the outputszUMake sure that you pass in as many target sizes as the batch dimension of the outputsrp   ro   r&   r   c              3   2   >#    U  H  oTS    :H  v   M     g7frm  rA   )rn  cropr,  s     rI   ro  <Sapiens2ImageProcessor._post_process_maps.<locals>.<genexpr>  s     *N58+;   c              3   2   >#    U  H  oTS    :H  v   M     g7frm  rA   )rn  r  final_sizess     rI   ro  r    s     6fZeRV{1~7MZer  r   Fru  )r@  rU   rA  rB  r   rC  r  r   rE  minrz  rX   r   r~  listrD  r   )r  r`  r;  r<  r\  model_heightmodel_widthoriginal_heightoriginal_width
new_height	new_widthpad_toppad_leftall_crops_equalr  all_final_sizes_equalr   topleftbottomrightr   map_itemr,  r  s                          @@rI   rb  )Sapiens2ImageProcessor._post_process_mapso  s    lELL11'..0LlELL11'..0L$ ,D 8!5]^^#D	S5F(Ftuu#D	S5F(Ftuuyy*ii(3?/(K$5|)%
 ?I>W<4:]^=F=TK39Z[ #j"?? 3y#>>	 4@ $)Ns*N*N'N#3?@<45;<@K%3?@<45;<@K$/ f36fZe6f3f4 +08(T65Aq#f*d5j89}}$Q#"'# $ZF* % s4y);$/4U|,Cvu'3v:tEz(ABH }} **1-(/'&+"'   H h'! *$ U A@s   ?KKrA   NNN)FFN)N   NNN)NNN)/rC   rD   rE   rF   r   valid_kwargsr   BILINEARr  r   r  r   r  r  r  r    r  r#   r   r  r   r   r  boolr   strr!   r   r  rU   rA  r   r6  rK   intrX   dictrZ  rO   rd  rZ   rh  r_   r  rb  rG   __classcell__r  s   @rI   r   r     s
   /L!**H&J$IS)DF#(D!E #  0404	YY &,Y De%&-	Y
 56Y 
Y Y4 59

 &,
 De%&-	

 
 ,
 j(4/
 c>)*T1
 

 
L "'04!8NU\\"8N 8N 	8N
 L8N 8N 8N 8N 8N 8N DK'$.8N 4;&-8N +8N 8N 8N  De%&-!8N$ 
ell	%8N| ?C"&BFBFL,L De%&L 5t;	L
 L 4<L !4c3h#884?L !4c3h#884?L 
d4U\\)*+	,Lb CGBF)-";."; !4c3h#884?"; !4c3h#884?	";
  $;"; 
d3$%	&";N CGBF)-#<0#< !4c3h#884?#< !4c3h#884?	#<
  $;#< 
d3$%	&#<P CG)-	G+G !4c3h#884?G  $&	G
 
d3$%	&GRVllV !4c3h#884?V !4c3h#884?	V
  $;V 
ell	V VrH   r   zfacebook/sapiens2-seg-0.4b)
checkpointc                     ^  \ rS rSr% SrSrSrSr\\	   S-  \
S'   Sr\\	   S-  \
S'   Sr\	\
S	'   Sr\S-  \
S
'   Sr\\	   S-  \
S'   Sr\\	   S-  \
S'   Sr\	\
S'   Sr\\	   S-  \
S'   Sr\\	   S-  \
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\\	   S-  \
S'   U 4S jrS\	\\	   -  \\	\	4   -  S\	\\	   -  \\	\	4   -  SS4S jrSrU =r$ )Sapiens2HeadConfigi  a  
upsample_out_channels (`list[int]`, *optional*):
    Output channel counts for each upsample block.
    The first block takes `hidden_size` channels as input; subsequent blocks use the previous output.
upsample_kernel_sizes (`list[int]`, *optional*):
    Kernel size for each upsample block. Auto-filled with `[4, ...]` when
    `upsample_out_channels` is set but this is `None`.
    Must have the same length as `upsample_out_channels`.
upsample_kernel_size (`int`, defaults to 4):
    Default kernel size for upsample blocks when `upsample_kernel_sizes` is not set.
use_pixel_shuffle (`bool`, *optional*):
    Whether the upsample head uses pixel-shuffle upsampling instead of transposed convolutions.
    When `None` (default), the head uses transposed convolutions.
conv_out_channels (`list[int]`, *optional*):
    Output channel counts for the refinement conv layers that follow the upsample blocks.
conv_kernel_sizes (`list[int]`, *optional*):
    Kernel size for each refinement conv layer. Auto-filled with `[1, ...]` when
    `conv_out_channels` is set but this is `None`.
    Must have the same length as `conv_out_channels`.
conv_kernel_size (`int`, defaults to 1):
    Default kernel size for conv layers when `conv_kernel_sizes` is not set.
scale_conv_out_channels (`list[int]`, *optional*):
    Output channel counts for the stride-2 conv layers used to predict the focal-length scale.
    When `None` (default), no scale branch is built.
scale_conv_kernel_sizes (`list[int]`, *optional*):
    Kernel size for each scale conv layer. Auto-filled with `[1, ...]` when
    `scale_conv_out_channels` is set but this is `None`.
    Must have the same length as `scale_conv_out_channels`.
scale_conv_kernel_size (`int`, defaults to 1):
    Default kernel size for scale conv layers when `scale_conv_kernel_sizes` is not set.
scale_final_input_size (`int`, *optional*):
    Flattened feature size passed into the scale MLP.
    When `None` (default), it is automatically inferred from `image_size` and `patch_size`
    in the parent [`Sapiens2Config`].
scale_final_hidden_sizes (`list[int]`, *optional*):
    Hidden-layer sizes for the MLP that maps flattened scale features to the scalar scale output.
    When `None` (default), no scale branch is built.
sapiens2_headhead_configNupsample_out_channelsupsample_kernel_sizes   upsample_kernel_sizeuse_pixel_shuffleconv_out_channelsconv_kernel_sizesr{   conv_kernel_sizescale_conv_out_channelsscale_conv_kernel_sizesscale_conv_kernel_sizescale_final_input_sizescale_final_hidden_sizesc                   > U R                   b5  U R                  c(  U R                  /[        U R                   5      -  U l        U R                  b5  U R
                  c(  U R                  /[        U R                  5      -  U l        U R                  b5  U R                  c(  U R                  /[        U R                  5      -  U l        [        TU ],  " S0 UD6  g r  )r  r  r  rC  r  r  r  r  r  r  r  __post_init__r  s     rI   r   Sapiens2HeadConfig.__post_init__  s    %%1d6P6P6X*.*C*C)Ds4KeKeGf)fD&!!-$2H2H2P&*&;&;%<s4CYCY?Z%ZD"''38T8T8\,0,G,G+H3tOkOkKl+lD(''rH   
image_size
patch_sizere   c                    U R                   c  U R                  b  U R                  c  g [        U[        [
        45      (       a  UOX4u  p4[        U[        5      (       a  UOUS   n[        U[        5      (       a  UOUS   nX5-  nXF-  nU R                   H-  n	U	S-
  S-  n
USU
-  -   U	-
  S-  S-   nUSU
-  -   U	-
  S-  S-   nM/     Xx-  U R                  S   -  U l         g )Nr   r{   r&   rg   )r  r  r  r@  r  rX   r  )r  r  r  image_heightimage_widthpatch_heightpatch_widthfeatures_heightfeatures_widthr   rd   s              rI   _init_scale_final_input_size/Sapiens2HeadConfig._init_scale_final_input_size  s    ''3++3++32<Z$PU2W2WJ^h]u!%/
C%@%@zjQRm$.z3$?$?jZPQ]&6$377K"Q1,G.W<{JqPSTTO,q7{:[HQNQRRN 8 '6&FIeIefhIi&i#rH   )r  r  r  r  )rC   rD   rE   rF   rM   
model_typebase_config_keyr  r  r  rW   r  r  r  r  r  r  r  r  r  r  r  r  r  rX   r  rG   r  r  s   @rI   r  r    s<   %N !J#O.249t+2.249t+2 !#!%)td{)*.tCy4'.*.tCy4'.c04T#Y-404T#Y-4"#C#)-C$J-15d3i$.5(jS	/E#s(O;jILtTWy[`adfiai[jIjj	j jrH   r  zfacebook/sapiens2-pretrain-0.4bc                     ^  \ rS rSr% SrSrS\0rSr\	\
S'   Sr\	\
S'   S	r\	\
S
'   Sr\	\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\	\
S'   Sr\\
S'   Sr\\
S'   Sr\\	   S-  \
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S'   Sr\	\
S '   Sr\\\	      S-  \
S!'   Sr \\!-  S-  \
S'   \"" 5       r#\"" 5       r$U 4S" jr%S#r&U =r'$ )$Sapiens2Configi   uR  
rope_theta (`float`, *optional*, defaults to 100.0):
    The base period of the RoPE embeddings.
query_bias (`bool`, *optional*, defaults to `True`):
    Whether to add a bias to the query projection.
key_bias (`bool`, *optional*, defaults to `False`):
    Whether to add a bias to the key projection.
value_bias (`bool`, *optional*, defaults to `True`):
    Whether to add a bias to the value projection.
proj_bias (`bool`, *optional*, defaults to `True`):
    Whether to add a bias to the output projection.
layerscale_value (`float`, *optional*, defaults to 1.0):
    Initial value to use for layer scale.
use_gated_mlp (`bool`, *optional*, defaults to `False`):
    Whether to use the SwiGLU feedforward neural network.
num_register_tokens (`int`, *optional*, defaults to 0):
    The number of register tokens.
pos_embed_shift (`float`, *optional*):
    Amount to randomly shift position embedding coordinates in [-shift, shift],
    applied only in training mode if not `None`.
pos_embed_jitter (`float`, *optional*):
    Amount to randomly jitter position embedding coordinates in log-uniform value in [1/jitter, jitter],
    applied only in training mode if not `None`.
pos_embed_rescale (`float`, *optional*, defaults to 2.0):
    Amount to randomly rescale position embedding coordinates in log-uniform value in [1/rescale, rescale],
    applied only in training mode if not `None`.
reshape_hidden_states (`bool`, *optional*, defaults to `True`):
    Whether to reshape the hidden states to spatial dimensions when used as backbone.
use_mask_token (`bool`, *optional*, defaults to `False`):
    Whether to use a mask token in the embeddings (needed for masked image modeling pretraining).
rms_norm_eps (`float`, *optional*, defaults to 1e-6):
    Epsilon for the RMS normalization layers.
normalize_backbone_outputs (`bool`, *optional*, defaults to `True`):
    Whether to apply RMSNorm to the backbone `feature_maps` and `cls_tokens` outputs before
    returning them from the forward pass. Only applies when the model is used as a backbone.
use_qk_norm (`bool`, *optional*, defaults to `True`):
    Whether to apply RMSNorm to queries and keys before RoPE in attention layers.
num_key_value_heads_per_layer (`list[int]`, *optional*):
    Number of key/value heads for each transformer layer. Setting a layer's value equal to
    `num_attention_heads` gives full multi-head attention; a smaller value gives grouped-query
    attention. Defaults to `num_attention_heads` for the first `num_first_full_attention_layers`
    and last `num_last_full_attention_layers` layers and `num_key_valueattention_heads` for all other
    layers.
num_key_value_attention_heads (`int`):
    Number of key/value heads for layers that use grouped-query attention when `num_key_value_heads_per_layer`
    is not set. Ignored when `num_key_value_heads_per_layer` is set.
num_first_full_attention_layers (`int`, *optional*, defaults to 8):
    Number of leading transformer layers that use full multi-head attention.
    Only used when `num_key_value_heads_per_layer` is `None`.
num_last_full_attention_layers (`int`, *optional*, defaults to 8):
    Number of trailing transformer layers that use full multi-head attention.
    Only used when `num_key_value_heads_per_layer` is `None`.
semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
    Label index ignored when computing the segmentation loss.
flip_pairs (`list[list[int]]`, *optional*):
    Pairs of keypoint indices that are mirrored horizontally (e.g., left ear ↔ right ear).
    Each pair is a two-element list `[left_index, right_index]`. Used for test-time
    horizontal flip augmentation in pose estimation: pass these pairs to the second
    forward call so the model flips heatmaps back before returning them.
head_config (`Sapiens2HeadConfig`, *optional*):
    Configuration for the decode head. See [`Sapiens2HeadConfig`] for the available options.
sapiens2r  r   hidden_size   num_hidden_layers   num_attention_headsi   intermediate_sizeFuse_mask_tokenTuse_gated_mlpsilu
hidden_actgư>rms_norm_epsnormalize_backbone_outputs   num_register_tokenskey_biasuse_qk_normNnum_key_value_heads_per_layernum_key_value_attention_headsnum_first_full_attention_layersnum_last_full_attention_layers   semantic_loss_ignore_index
flip_pairsc                 
  > U R                   cl  [        U R                  5       Vs/ s HF  nX R                  :  d  X R                  U R                  -
  :  a  U R
                  OU R                  PMH     snU l         [        U R                  [        5      (       a  [        S0 U R                  D6U l        U R                  b.  U R                  R                  U R                  U R                  S9  [        TU ]<  " S0 UD6  g s  snf )N)r  r  rA   )r  rD  r  r  r  r  r  r@  r  r  r  r  r  r  r  r  )r  r   layer_indexr  s      rI   r  Sapiens2Config.__post_init__|  s    --5 $))?)?#@2 $AK	  "F"FF"&<&<t?b?b&bb ((
 778 $A2D. d&&--1ED4D4DED'99T__aeapap9q''2s   AD )r  r  )(rC   rD   rE   rF   rM   r  r  sub_configsr  r  rW   r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  AttributeErrorlayer_norm_epsapply_layernormr  rG   r  r  s   @rI   r  r     s   =~ J "45KKs!!!s! ND M4JL%'++  HdK6:!49t#3:)*!3*+,#S,*+"C+&)))-JT#Y$&-48K#d*T18#%N$&O( (rH   r  c                      ^  \ rS rSrS\4U 4S jjrS
S\R                  S\R                  S-  S\R                  4U 4S jjjrS	r	U =r
$ )Sapiens2Embeddingsi  configc                    > [         TU ]  U5        UR                  (       a<  [        R                  " [
        R                  " SSUR                  5      5      U l        g S U l        g )Nr{   )	r  r  r  r   	ParameterrU   r   r  
mask_tokenr  r  r  s     rI   r  Sapiens2Embeddings.__init__  sB     QWQfQf",,u{{1a9K9K'LMlprH   Npixel_valuesbool_masked_posre   c                 X   > Ub  U R                   c  [        S5      e[        TU ]  X5      $ )Nz:bool_masked_pos requires use_mask_token=True in the config)r  r   r  forward)r  r  r  r  s      rI   r  Sapiens2Embeddings.forward  s-    &4??+BYZZw|==rH   )r  r  rC   rD   rE   rF   r  r  rU   rA  r  rG   r  r  s   @rI   r  r    sE    q~ q>ELL >5<<RVCV >bgbnbn > >rH   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Sapiens2RopePositionEmbeddingi  r  c                 (  > [         TU ]  U 5        U ?U ?UR                  n[        U[        5      (       a  UOX"4u  p4UR                  n[        U[        5      (       a  UOUS   n[        U[        5      (       a  UOUS   nX6-  U l        XG-  U l        g )Nr   r{   )	r  r  num_patches_hnum_patches_wr  r@  r   r  r  )	r  r  r  image_himage_wr  patch_size_hpatch_size_wr  s	           rI   r  &Sapiens2RopePositionEmbedding.__init__  s    &&
)3J)I)I:PZOg&&
%/
C%@%@zjQRm%/
C%@%@zjQRm$4$4rH   )r  r  rC   rD   rE   rF   r  r  rG   r  r  s   @rI   r  r    s    5~ 5 5rH   r  c                       \ rS rSrSrg)Sapiens2RMSNormi  rA   NrB   rA   rH   rI   r  r    r   rH   r  c                      ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S-  S\	\R                  \R                  4   S-  S	\
\   S
\	\R                  \R                  S-  4   4
S jjrSrU =r$ )Sapiens2Attentioni  r  	layer_idxc                   > [         TU ]  U5        U ?U ?UR                  U   U l        U R                  U R
                  -  U l        [        R                  " U R                  U R
                  U R                  -  UR                  S9U l        [        R                  " U R                  U R
                  U R                  -  UR                  S9U l        UR                  (       a  [        U R                  UR                   S9O[        R"                  " 5       U l        UR                  (       a$  [        U R                  UR                   S9U l        g [        R"                  " 5       U l        g )N)biasr   )r  r  k_projv_projr  num_key_value_heads	num_headsnum_key_value_groupsr   Linear	embed_dimhead_dimr  
value_biasr  r  r  Identityq_normk_normr  r  r  r  s      rI   r  Sapiens2Attention.__init__  s     KK#)#G#G	#R $(NNd6N6N$N!ii0H0H4==0X_e_n_noii0H0H4==0X_e_p_pqQWQcQcodmm9L9LMikititivQWQcQcodmm9L9LMikititivrH   NrS   attention_maskposition_embeddingsr   re   c                 
   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      nU R                  U5      nUu  p[        XxX5      u  px[        R                  " U R                  R                  [        5      nU" U UUU	U4U R                  (       d  SOU R                   U R"                  S.UD6u  pUR$                  " / UQSP76 R'                  5       nU R)                  U5      nX4$ )Nrg   r{   r&   r   )dropoutscaling)r   r  q_projview	transposer  r  r  r  r3   r   get_interfacer  _attn_implementationr4   trainingr$  r%  r   
contiguouso_proj)r  rS   r!  r"  r   input_shapehidden_shapequery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  rI   r  Sapiens2Attention.forward  sv    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ (?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
! "));;;;FFHkk+.((rH   )r  r  r  r  r  r  r  )rC   rD   rE   rF   r  r  r  rU   rA  rX   r    r"   r  rG   r  r  s   @rI   r  r    s    	w~ 	w# 	w /3HL	#)||#) t+#) #5<<#=>E	#)
 +,#) 
u||U\\D00	1#) #)rH   r  c                       \ rS rSrSrg)Sapiens2LayerScalei  rA   NrB   rA   rH   rI   r:  r:    r   rH   r:  c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )Sapiens2Layeri  r  r  c                    > [         TU ]  U5        [        XS9U l        [	        UR
                  UR                  S9U l        [	        UR
                  UR                  S9U l        [        R                  " 5       U l        g )Nr  r  )r  r  r  	attentionr  r  r  norm1norm2r   r  layer_scale2r  s      rI   r  Sapiens2Layer.__init__  s^     *6G$V%7%7V=P=PQ
$V%7%7V=P=PQ
KKMrH   )r?  rB  r@  rA  )	rC   rD   rE   rF   r  r  r  rG   r  r  s   @rI   r<  r<    s    *~ *# * *rH   r<  c                      ^  \ rS rSr         SS\S\S\\\\4   -  S\S\\\\4   -  \-  S\S\S	\S
\S\S\4U 4S jjjrS\	R                  S\	R                  4S jrSrU =r$ )Sapiens2ConvLayeri  in_channelsout_channelsr   striderd   groups
activationr  convolution_transposepixel_shufflescale_factorc           
        > [         TU ]  5         U	(       a.  [        R                  " UU
(       a  X+S-  -  OUUUUUUS9U l        O-[        R
                  " UU
(       a  X+S-  -  OUUUUUUS9U l        U
(       a  [        R                  " U5      O[        R                  " 5       U l        [        R                  " U5      U l
        [        U   U l        g )Nr&   )r   rH  rd   r  rI  )r  r  r   ConvTranspose2dconvolutionConv2dPixelShuffler  rL  InstanceNorm2dnormr   act_fn)r  rF  rG  r   rH  rd   rI  rJ  r  rK  rL  rM  r  s               rI   r  Sapiens2ConvLayer.__init__  s     	 !112?Q.\' D  "yy2?Q.\' D ?LR__\:QSQ\Q\Q^%%l3	Z(rH   rS   re   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r  )rP  rL  rT  rU  r  rS   s     rI   r  Sapiens2ConvLayer.forward  sD    ((7**=9		-0M2rH   )rU  rP  rT  rL  )	r{   r{   r   r{   r  TFFr&   )rC   rD   rE   rF   r  rX   r  r  r  rU   rA  r  rG   r  r  s   @rI   rE  rE    s    
 .//0 &+#%)%) %) 5c?*	%)
 %) uS#X&,%) %) %) %)  $%) %) %) %)NU\\ ell  rH   rE  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Sapiens2Headi  r  c                   >^ [         TU ]  5         TR                  R                  (       a   [	        TR
                  TR
                  SSS9O[        R                  " 5       U l        TR
                  /TR                  R                  S S -   n[        R                  " U4S j[        UTR                  R                  TR                  R                  5       5       5      U l        TR                  R                  S   /TR                  R                  S S -   n[        R                  " U4S j[        UTR                  R                  TR                  R                  5       5       5      U l        TR                  R                  (       a  TR                  R                  S   O?TR                  R                  (       a  TR                  R                  S   OTR
                  n[        R"                  " UTR$                  SS9U l        g )Nr   r{   r   rd   rg   c              3   p  >#    U  H  u  pn[        UUUTR                  R                  (       a  S OSTR                  R                  (       a  US -
  S-  OS [        TR                  R                  5      [        TR                  R                  5      TR                  R                  (       + S9v   M     g7f)r{   r&   )r   rH  rd   r  rL  rK  N)rE  r  r  r  rn  in_chout_chr   r  s       rI   ro  (Sapiens2Head.__init__.<locals>.<genexpr>$  s      -
/*{ '"..@@qa282D2D2V2VqQ.\]&,,>>?"6#5#5#G#GH*0*<*<*N*N&N	/s   B3B6c              3      >#    U  H7  u  pn[        UUUTR                  R                  (       a  US -
  S-  OSS9v   M9     g7f)r{   r&   r   r]  N)rE  r  r  r_  s       rI   ro  rb  6  sL      
)
/*{ '282D2D2V2VqQ.\]	/s   ?A)r   )r  r  r  r  rE  r  r   r  
input_convr  
ModuleListr$  r  upsample_layersr  r  conv_layersrQ  
num_labels	predictor)r  r  upsample_in_channelsconv_in_channelspredictor_inr  s    `   rI   r  Sapiens2Head.__init__  s    !!33 f00&2D2DRS]^_ 	
 !' 2 23f6H6H6^6^_b`b6cc!}} -
 /2$""88""88/-
  
" #..DDRHIFL^L^LpLpqtrtLuu== 
)
 /2 &"4"4"F"FHZHZHlHl/
)
 

 !!33 004 !!77 ##99"=## 	 <1B1BPQRrH   rS   re   c                     U R                  U5      nU R                   H  nU" U5      nM     U R                   H  nU" U5      nM     U R                  U5      $ r  )rd  rf  rg  ri  r  rS   layers      rI   r  Sapiens2Head.forwardJ  sS    6))E!-0M *%%E!-0M &~~m,,rH   )rg  rd  ri  rf  r  r  s   @rI   r[  r[    s2    ,S~ ,S\-U\\ -ell - -rH   r[  c                   B    \ rS rSrS\S\S\R                  SS4S jrSrg)	Sapiens2PointmapFinalLayerBlockiS  in_dimout_dimrJ  re   Nc                     [         R                  R                  U 5        [         R                  " [         R                  " X5      U/5      U l        g r  )r   Moduler  re  r  layers)r  rt  ru  rJ  s       rI   r  (Sapiens2PointmapFinalLayerBlock.__init__T  s2    
		4 mmRYYv%?$LMrH   )rx  )	rC   rD   rE   rF   r  r   rw  r  rG   rA   rH   rI   rs  rs  S  s,    Ns NS Nbii ND NrH   rs  c            	          ^  \ rS rSrSS\S\\\4   S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )Sapiens2PointmapFinalLayeriY  rt  hidden_sizesru  rJ  c                    > [         TU ]  5         [        R                  " 5       U l        [        XS   [        U   S9U l        [        US   US   [        U   S9U l        [        R                  " US   U5      U l
        g )Nr   )rt  ru  rJ  r{   )r  r  r   Flattenr   rs  r   block1block2r  proj)r  rt  r|  ru  rJ  r  s        rI   r  #Sapiens2PointmapFinalLayer.__init__Z  so    zz|5?vj?Q
 6?LOzHZ
 IIl1ow7	rH   rS   re   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      $ r  )r   r  r  r  rX  s     rI   r  "Sapiens2PointmapFinalLayer.forwarde  s;    ]3M2M2yy''rH   )r  r  r   r  )r{   r  )rC   rD   rE   rF   r  rX   r  r  rU   rA  r  rG   r  r  s   @rI   r{  r{  Y  sR    	8s 	8%S/ 	8C 	8ad 	8 	8(U\\ (ell ( (rH   r{  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Sapiens2PointmapScaleHeadil  r  c                   > [         TU ]  5         [        R                  " 5       U l        UR
                  /UR                  R                  S S -   n[        UUR                  R                  UR                  R                  5       H1  u  p4nU R                  R                  [        X4USUS-
  S-  S95        M3     [        UR                  R                  UR                  R                  UR                  S9U l        g )Nrg   r&   r{   )r   rH  rd   )rJ  )r  r  r   re  rg  r  r  r  r$  r  rE  rE  r{  r  r  r  ri  )r  r  scale_in_channelsr`  ra  r   r  s         rI   r  "Sapiens2PointmapScaleHead.__init__m  s    ==?#//063E3E3]3]^a_a3bb*-6666+
&E;
 ##!%[QR]hkl]lqr\rs+
 45577((
rH   rS   re   c                 Z    U R                    H  nU" U5      nM     U R                  U5      $ r  rg  ri  ro  s      rI   r  !Sapiens2PointmapScaleHead.forward  s+    %%E!-0M &~~m,,rH   r  r  r  s   @rI   r  r  l  s/    
~ 
$-U\\ -ell - -rH   r  c                   V    \ rS rSrSrS/rS/r\R                  " 5       SS j5       r	Sr
g)	Sapiens2PreTrainedModeli  modelperiodsr  Nc           	         [         R                  " X5        [        U[        R                  [        R
                  45      (       a5  [        R                  " UR                  SU R                  R                  S9  g [        U[        R                  5      (       a!  [        R                  " UR                  SSS9  g [        U[        5      (       a  [        R                  " UR                  SU R                  R                  S9  UR                  R                  S:  a4  [        R                  " UR                   SU R                  R                  S9  UR                  R"                  (       a!  [        R$                  " UR&                  5        g g [        U[(        5      (       a6  [        R*                  " UR,                  U R                  R.                  5        g [        U[0        5      (       ad  SUR2                  [4        R6                  " SSSUR8                  -  [4        R:                  S	9-  -  n[        R<                  " UR>                  U5        g [        U[@        [B        45      (       a  URE                  5        H  n[        U[        R
                  5      (       a"  [        R                  " UR                  SSS9  MD  [        U[        R                  5      (       d  Me  [        R                  " UR                  S
SS9  M     g g )Nr   )meanstdfan_outrelu)r   nonlinearityr   r{   r  )r~   fan_inlinear)#r   _init_weightsr@  r   r  rQ  inittrunc_normal_weightr  initializer_rangerO  kaiming_normal_r  	cls_tokenr  register_tokensr  zeros_r  r:  	constant_lambda1layerscale_valuer  baserU   r   r  r   copy_inv_freqr[  r  modules)r  moduler  head_modules       rI   r  %Sapiens2PreTrainedModel._init_weights  s   %%d3fryy"))455v}}3DKK<Y<YZ 2 233  YVT 233v//ct{{?\?\]}}0014""6#9#9IfIfg}}++F--. , 233NN6>>4;;+G+GH =>>6;;%,,q!Q=PX]XeXe*fffHJJv1/H IJJ%~~/k29955((););)Z`aRYY77((););(Yab	  0 KrH   rA   )re   N)rC   rD   rE   rF   base_model_prefix"_keys_to_ignore_on_load_unexpected_keys_to_ignore_on_load_missingrU   no_gradr  rG   rA   rH   rI   r  r    s4     +5&'4o#
]]_c crH   r  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Sapiens2Encoderi  r  c           
         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf )Nr>  )r  r  r   re  rD  r  r<  rp  r  s      rI   r  Sapiens2Encoder.__init__  sG     ]]INvOgOgIhiIhI]67Ihi

is   A)rp  r
  r  s   @rI   r  r    s    
~ 
 
rH   r  c            	          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\
4U 4S	 jjjrS
rU =r$ )Sapiens2Modeli  r  c                 j   > [         TU ]  U5        [        UR                  UR                  S9U l        g Nr  r  r  r  r  r  rT  r  s     rI   r  Sapiens2Model.__init__  *     #F$6$6F<O<OP	rH   Nr  r  r   re   c                 *   > [         TU ]  " U4SU0UD6$ )a@  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
    pre-training.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-pretrain-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-pretrain-0.4b")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> cls_token = outputs.pooler_output
>>> cls_token.shape
torch.Size([1, 1024])
```
r  )r  r  )r  r  r  r   r  s       rI   r  Sapiens2Model.forward  s    > w|W_WPVWWrH   rT  r  )rC   rD   rE   rF   r  r  rU   rA  r    r"   r   r  rG   r  r  s   @rI   r  r    s_    Q~ Q 04XllX ,X +,	X
 
$X XrH   r  c                   `   ^  \ rS rSrS\4U 4S jjrS\R                  S\\	   S\
4S jrSrU =r$ )	Sapiens2Backbonei  r  c                 j   > [         TU ]  U5        [        UR                  UR                  S9U l        g r  r  r  s     rI   r  Sapiens2Backbone.__init__  r  rH   r  r   re   c                    UR                  U R                  R                  R                  R                  5      nU R                  U5      nU R                  U5      nSUS'   U R                  " X440 UD6nUR                  nUR                  u  pxpU R                  R                  n[        U[        5      (       a  UOUS   n[        U[        5      (       a  UOUS   nX-  nX-  nS[        U R                  SS5      -   n[        U R                  SS5      n/ / nn[        [        U R                   U5      5       H  u  nu  nnU R                  R"                  (       a  U R%                  U5      nUU R&                  ;   d  MG  U(       a  UR)                  USS2SSS24   5        USS2US2SS24   nU R                  R*                  (       aA  UR-                  X~UUR                  S	   5      R/                  SS
SS5      R1                  5       nOUnUR)                  U5        M     [3        [5        U5      U(       a  [5        U5      OSUR                  UR6                  S9$ )a  
Example:

```python
>>> from transformers import AutoBackbone, AutoImageProcessor
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-pretrain-0.4b")
>>> model = AutoBackbone.from_pretrained("facebook/sapiens2-pretrain-0.4b")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs, return_class_token=True)

>>> outputs.feature_maps[0].shape
torch.Size([1, 1024, 64, 48])
>>> outputs.cls_tokens[0].shape
torch.Size([1, 1024])
```
Toutput_hidden_statesr   r{   r  return_class_tokenFNrg   r   r&   )feature_maps
cls_tokensrS   rT   )r|  
embeddingspatch_embeddingsr  r~   rope_embeddingsr  rS   r   r  r  r@  r  getattr	enumerater$  stage_namesr  rT  out_featuresrE  reshape_hidden_statesr   permuter,  r?   rX   rT   )r  r  r   rS   r"  r   stage_hidden_statesr  r   r  r  r  r  r  num_patches_heightnum_patches_width
num_prefixr  r  r  idx
stage_namehidden_statepatch_tokensfeature_maps                            rI   r  Sapiens2Backbone.forward  s   6 $t'G'G'N'N'T'TU5"22<@)-%&MI&I$223?3E3E0
|[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm)9'7.CQGG
$T[[2FN#%rj/8T=M=MOb9c/d+C+*l{{55#yy6T...%%%l1a7&;<+Az{A,=>;;44$,,ZM^`l`r`rsu`vw Aq!,#   #/K##K0# 0e& &|,,>uZ(D ..((	
 	
rH   r  )rC   rD   rE   rF   r  r  rU   rA  r    r"   r?   r  rG   r  r  s   @rI   r  r    sF    Q~ QF
llF
 +,F
 
 	F
 F
rH   r  c                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\\   S\4S	 jj5       5       rS
rU =r$ )Sapiens2ForSemanticSegmentationi$  r  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r  r  r  rh  r  r  r[  decode_head	post_initr  s     rI   r  (Sapiens2ForSemanticSegmentation.__init__&  @      ++"6*
'/rH   Nr  r>  r   re   c                    Ub%  U R                   R                  S:X  a  [        S5      eU R                  " U40 UD6nUR                  u  pVpxU R                   R
                  n	[        U	[        5      (       a  U	OU	S   n
[        U	[        5      (       a  U	OU	S   nXz-  nX-  nUR                  SS2SU R                   R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      nSnUb$  U R                  UX R                   R                  S9n[        UUUR                   UR"                  S9$ )	a`  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss.
    Indices should be in `[0, ..., config.num_labels - 1]`.
    If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-seg-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-seg-0.4b")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.logits.shape
torch.Size([1, 29, 1024, 768])
```
Nr{   z/The number of labels should be greater than oner   r&   rg   )ignore_index)rQ   logitsrS   rT   )r  rh  r   r  r   r  r@  r  last_hidden_stater  r(  r   r  loss_functionr  r   rS   rT   )r  r  r>  r   r8  r  r   rp   ro   r  r  r  r  r  r  r  r  rQ   s                     rI   r  'Sapiens2ForSemanticSegmentation.forward-  sA   B $++"8"8A"=NOO**\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e!!+.%%ff;;CiCi%jD&!//))	
 	
rH   r  r  rh  r  )rC   rD   rE   rF   r  r  r%   r#   rU   rV   
LongTensorr    r"   r   r  rG   r  r  s   @rI   r  r  $  sm    ~   +/9
''9
   4'9
 +,	9

 
!9
  9
rH   r  zfacebook/sapiens2-pose-0.4bz
    The Sapiens2 model with a pose estimation head on top (a set of heatmap predictors on top of the hidden states output).
    )r  r=   c                      ^  \ rS rSrS\4U 4S jjr\\  SS\R                  S\R                  S-  S\R                  S-  S\\   S	\4
S
 jj5       5       rSrU =r$ )Sapiens2ForPoseEstimationik  r  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r  r  r  s     rI   r  "Sapiens2ForPoseEstimation.__init__r  r  rH   Nr  r  r>  r   re   c                 0   U R                   " U40 UD6nUR                  u  pgpU R                  R                  n
[	        U
[
        5      (       a  U
OU
S   n[	        U
[
        5      (       a  U
OU
S   nX-  nX-  nUR                  SS2SU R                  R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      nUb  [        UU5      nSnUb  [        S5      e[        UUUR                  UR                  S9$ )av  
flip_pairs (`torch.Tensor` of shape `(num_pairs, 2)`, *optional*):
    Pairs of keypoints which are mirrored (for example, left ear -- right ear), used for
    test-time flip augmentation. When provided, the model assumes `pixel_values` contains
    horizontally-flipped images and calls `flip_back` on the output heatmaps to restore the
    original orientation.
labels (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`, *optional*):
    Heatmap ground truth for computing the loss.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-pose-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-pose-0.4b")

>>> boxes = [[[270.8, 0.6, 294.1, 379.5]]]
>>> inputs = image_processor(image, boxes=boxes, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.heatmaps.shape
torch.Size([1, 308, 256, 192])
```
r   r{   Nr&   rg   Training is not yet supported)rQ   r   rS   rT   )r  r   r  r  r@  r  r  r  r(  r   r  r<   NotImplementedErrorrK   rS   rT   )r  r  r  r>  r   r8  r  r   rp   ro   r  r  r  r  r  r  r  r   rQ   s                      rI   r  !Sapiens2ForPoseEstimation.forwardy  s   L **\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e##K0! :6H%&EFF*!//))	
 	
rH   r  r  )rC   rD   rE   rF   r  r  r%   r#   rU   rV   rA  r    r"   rK   r  rG   r  r  s   @rI   r  r  k  s    ~   +/+/	=
''=
 LL4'=
 !!D(	=

 +,=
 
%=
  =
rH   r  zfacebook/sapiens2-normal-0.4bz
    The Sapiens2 model with a normal estimation head on top (a PixelShuffle-based decoder that predicts surface normal maps).
    c                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\
\   S\4S	 jj5       5       rS
rU =r$ )Sapiens2ForNormalEstimationi  r  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        U R                  5         g r  r  r  s     rI   r  $Sapiens2ForNormalEstimation.__init__  r  rH   Nr  r>  r   re   c                    U R                   " U40 UD6nUR                  u  pVpxU R                  R                  n	[	        U	[
        5      (       a  U	OU	S   n
[	        U	[
        5      (       a  U	OU	S   nXz-  nX-  nUR                  SS2SU R                  R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      nSnUb  [        S5      e[        UUUR                  UR                  S9$ )a  
labels (`torch.FloatTensor` of shape `(batch_size, num_labels, height, width)`, *optional*):
    Ground-truth surface normal maps for computing the loss.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-normal-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-normal-0.4b")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.normals.shape
torch.Size([1, 3, 1024, 768])
```
r   r{   Nr&   rg   r  )rQ   rR   rS   rT   )r  r   r  r  r@  r  r  r  r(  r   r  r  rO   rS   rT   )r  r  r>  r   r8  r  r   rp   ro   r  r  r  r  r  r  r  rR   rQ   s                     rI   r  #Sapiens2ForNormalEstimation.forward  s
   > **\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e"";/%&EFF,!//))	
 	
rH   r  r  )rC   rD   rE   rF   r  r  r%   r#   rU   rV   r    r"   rO   r  rG   r  r  s   @rI   r  r    sm    ~   ,04
''4
 !!D(4
 +,	4

 
'4
  4
rH   r  zfacebook/sapiens2-pointmap-0.4bz
    The Sapiens2 model with a pointmap head on top (a PixelShuffle-based decoder that predicts per-pixel 3D XYZ
    coordinates, plus an optional scale branch for focal-length normalization).
    c                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\
\   S\4S	 jj5       5       rS
rU =r$ )Sapiens2ForPointmapEstimationi  r  c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        UR                  b"  UR                  R                  b  [        U5      O[        R                  " 5       U l        U R                  5         g r  )r  r  r  r  r[  r  r  r  r  r   r  
scale_headr  r  s     rI   r  &Sapiens2ForPointmapEstimation.__init__
  sn     "6*
'/ !!-&2D2D2\2\2h &f- 	
 	rH   Nr  r>  r   re   c                    U R                   " U40 UD6nUR                  u  pVpxU R                  R                  n	[	        U	[
        5      (       a  U	OU	S   n
[	        U	[
        5      (       a  U	OU	S   nXz-  nX-  nUR                  SS2SU R                  R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      n[	        U R                  [        R                  5      (       a  SOU R                  U5      nSnUb  [        S5      e[        UUUUR                   UR"                  S9$ )a  
labels (`torch.FloatTensor` of shape `(batch_size, 3, height, width)`, *optional*):
    Ground-truth pointmap for computing the loss.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-pointmap-0.4b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-pointmap-0.4b")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.pointmaps.shape
torch.Size([1, 3, 1024, 768])
```
r   r{   Nr&   rg   r  )rQ   r\   r]   rS   rT   )r  r   r  r  r@  r  r  r  r(  r   r  r  r   r  r  rZ   rS   rT   )r  r  r>  r   r8  r  r   rp   ro   r  r  r  r  r  r  r  r\   r]   rQ   s                      rI   r  %Sapiens2ForPointmapEstimation.forward  s2   > **\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e$$[1	#DOOR[[AAtWbGc%&EFF.!//))
 	
rH   )r  r  r  r  )rC   rD   rE   rF   r  r  r%   r#   rU   rV   r    r"   rZ   r  rG   r  r  s   @rI   r  r    sm    	~ 	  ,06
''6
 !!D(6
 +,	6

 
)6
  6
rH   r  zfacebook/sapiens2-matting-1bz
    The Sapiens2 model with a matting head on top (a PixelShuffle-based decoder that predicts a
    pre-multiplied RGB foreground and an alpha matte).
    c                      ^  \ rS rSrS\4U 4S jjr\\ SS\R                  S\R                  S-  S\
\   S\4S	 jj5       5       rS
rU =r$ )Sapiens2ForImageMattingiP  r  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r  )r  r  r  r  r[  r  r  r  s     rI   r   Sapiens2ForImageMatting.__init__X  s3     "6*
'/rH   Nr  r>  r   re   c                 \   U R                   " U40 UD6nUR                  u  pVpxU R                  R                  n	[	        U	[
        5      (       a  U	OU	S   n
[	        U	[
        5      (       a  U	OU	S   nXz-  nX-  nUR                  SS2SU R                  R                  -   S24   nUR                  SS5      R                  USX5      nU R                  U5      R                  5       nUSS2SS24   nUSS2SS24   nSnUb  [        S5      e[        UUUUR                  UR                  S9$ )	a	  
labels (`torch.FloatTensor` of shape `(batch_size, 4, height, width)`, *optional*):
    Ground-truth matting targets for computing the loss.

Example:

```python
>>> from transformers import AutoImageProcessor, AutoModel
>>> from transformers.image_utils import load_image
>>> import torch

>>> image = load_image("http://images.cocodataset.org/val2017/000000004016.jpg")
>>> image_processor = AutoImageProcessor.from_pretrained("facebook/sapiens2-matting-1b")
>>> model = AutoModel.from_pretrained("facebook/sapiens2-matting-1b")

>>> inputs = image_processor(image, return_tensors="pt")
>>> with torch.inference_mode():
...     outputs = model(**inputs)

>>> outputs.alphas.shape
torch.Size([1, 1, 1024, 768])
>>> outputs.foregrounds.shape
torch.Size([1, 3, 1024, 768])
```
r   r{   Nr&   rg   r   r  )rQ   r}  ra   rS   rT   )r  r   r  r  r@  r  r  r  r(  r   r  sigmoidr  r_   rS   rT   )r  r  r>  r   r8  r  r   rp   ro   r  r  r  r  r  r  r  r  ra   r}  rQ   s                       rI   r  Sapiens2ForImageMatting.forward^  s7   B **\4V4'3'9'9$
v[[++
%/
C%@%@zjQRm%/
C%@%@zjQRm-+00A8W8W4W4Y1YZ",,Q2:::r<e"";/779a!enAB%&EFF)#!//))
 	
rH   )r  r  r  )rC   rD   rE   rF   r  r  r%   r#   rU   rV   r    r"   r_   r  rG   r  r  s   @rI   r   r   P  sm    ~   ,09
''9
 !!D(9
 +,	9

 
$9
  9
rH   r   )r  r  r  r  r  r  r   r  r  r  r   )g      ?)r  )collections.abcr   r   dataclassesr   typingr   rU   torch.nn.functionalr   r	   r   huggingface_hub.dataclassesr   torchvision.transforms.v2r   &transformers.image_processing_backendsr
   2transformers.models.dinov3_vit.modeling_dinov3_vitr    r   r  activationsr   configuration_utilsr   image_processing_utilsr   image_transformsr   r   image_utilsr   r   r   r   r   r   r   r   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr    utilsr!   r"   r#   r$   utils.genericr%   beit.image_processing_beitr'   r(   #dinov3_vit.configuration_dinov3_vitr)   dinov3_vit.modeling_dinov3_vitr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   gemma2.modeling_gemma2r4   llama.modeling_llamar5    mask2former.modeling_mask2formerr6   0pp_ocrv5_server_det.modeling_pp_ocrv5_server_detr7   sam3.processing_sam3r8   r9   vitmatte.modeling_vitmatter:   vitpose.modeling_vitposer;   r<   
get_loggerrC   loggerr?   rK   rO   rZ   r_   rA  rX   r  r   rx   r   r   r   r   r   r   r  r  r  r  r  r  r:  r<  rE  rw  r[  rs  r{  r  r  r  r  r  r  r  r  r  r   __all__rA   rH   rI   <module>r&     s   / !     .  7 E V & ! 3 2 E	 	 	 a ` F & L L - U A   = / I a G ; H 
		H	%  	4 	 	 
 
"8 
 
 
 <K < <* 
 <k < <0 
 1!3 1 1( #<<#sCx# # 5<<%&	#T 	<<<<<<< sCx< 	<
 \\<~) )s )ELL )<u|| ellELL>X8Y 4 NPE?||E?',||E?GJE?
\\E?P	#;5 	g/ gT 78Sj) Sj  9Sjl <=i(_ i(  >i(X>, >5$B 5	l 	/)* /)d	, 	*N *-: -`5-299 5-pN&@ N( (&-		 -2c6 cD
& 
$XN $XNK
( K
\ 78C
&= C
 9C
L ,G
 7 G
G
T .>
"9 >
>
B 0D
$; D
D
N -B
5 B
B
JrH   