
    3j                        S SK Jr  S SKrS SKJs  Jr  S SKJr  S SK	J
r
  SSKJr  SSKJrJr  SSKJrJrJrJrJrJrJrJr  SS	KJrJr  SS
KJrJrJr  SSK J!r!J"r"J#r#J$r$   " S S\SS9r%S r&S r' S#S\RP                  S\)\*\*4   S\+S\)\RP                  \RP                  4   4S jjr, S#S\RP                  S\RP                  S\)\*\*4   S\+S\RP                  4
S jjr-S$S\RP                  S\*S\RP                  4S jjr.S\RP                  S\)\RP                  \RP                  4   4S jr/ S$S\RP                  S\RP                  S\*S\RP                  4S  jjr0\ " S! S"\
5      5       r1S"/r2g)%    )UnionN)
functional)TorchvisionBackend   )BatchFeature)group_images_by_shapereorder_images)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDChannelDimension
ImageInputPILImageResamplingSizeDict#get_image_size_for_max_height_widthmake_list_of_images)ImagesKwargsUnpack)
TensorTypeauto_docstringis_torch_available   )Sapiens2ImageMattingOutputSapiens2NormalEstimatorOutputSapiens2PointmapEstimatorOutputSapiens2PoseEstimatorOutputc                   $    \ rS rSr% Sr\\S'   Srg)Sapiens2ImageProcessorKwargs1   aW  
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
    Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
    is used for background, and background itself is not included in all classes of a dataset (e.g.
    ADE20k). The background label will be replaced by 255.
do_reduce_labels N)__name__
__module____qualname____firstlineno____doc__bool__annotations____static_attributes__r        p/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/sapiens2/image_processing_sapiens2.pyr   r   1   s     r)   r   F)totalc                 f    U R                  S5      u  pp#XX-   X-   /n[        R                  " USS9$ )Ndimunbindtorchstackxywhbs        r*   box_xywh_to_xyxyr:   <   s4    "JA!
AEQU$A;;qb!!r)   c                 v    U R                  S5      u  pp#U SU-  -   USU-  -   X#/n[        R                  " USS9$ )Nr-         ?r.   r0   r4   s        r*   box_xywh_to_cxcywhr=   B   s@    "JA!
cAg+S1W0A;;qb!!r)   boxesoutput_sizepaddingreturnc           	         U R                  S5      u  p4pV[        R                  " X4/SS9nXR-  nXb-  n	Uu  pX-  n[        R                  " XU-  :  S   [        R                  " XU-  /SS9[        R                  " X-  U	/SS95      nX}4$ )a  Compute crop center and scale from bounding boxes, applying padding and aspect ratio correction.

Accepts either a single box `(4,)` or multiple boxes `(num_boxes, 4)` and returns center/scale with a matching
leading dimension.

Args:
    boxes (`torch.Tensor` of shape `(4,)` or `(num_boxes, 4)`): Bounding box in
        (center-x, center-y, width, height) format, with values in absolute pixel coordinates.
    output_size (`tuple[int, int]`): Target output size as `(height, width)`, used to compute
        the aspect ratio for scale correction.
    padding (`float`, *optional*, defaults to `1.25`): Multiplicative factor applied to the
        bounding box dimensions, adding context around the region of interest.

Returns:
    `tuple[torch.Tensor, torch.Tensor]`: A pair `(center, scale)` where `center` has shape
    `(..., 2)` with (x, y) in input-image pixel coordinates, and `scale` has shape `(..., 2)`
    with (width, height) in input-image pixels representing the dimensions of the padded,
    aspect-ratio-corrected crop window.
r-   r.   ).N)r1   r2   r3   where)r>   r?   r@   center_xcenter_ywidthheightcenterscaled_widthscaled_heightoutput_heightoutput_widthaspect_ratioscales                 r*   boxes_to_crop_paramsrO   H   s    0 ).R(8%H[[(-26F?L$M"-M/LKK	4	4i@\,#>?RH]1=ArJE
 =r)   imagec           	      "   Uu  pEU R                   u  pgn[        XUS9u  pU	R                  S5      u  pU
R                  S5      u  pUS-
  U-  nUS-
  U-  n[        R                  " UU5      S:  n[        R
                  " [        R                  " U[        R                  U R                  S9[        R                  " U[        R                  U R                  S9SS9u  nnUUSS2SS4   -  USS2SS4   -   S	USS2SS4   -  -
  nUUSS2SS4   -  USS2SS4   -   S	USS2SS4   -  -
  n[        R                  " S
U-  US-
  -  S-
  S
U-  US-
  -  S-
  /SS9nUR                   S   n[        R                  " UXdXPR                  U R                  S9nU R                  S5      nUS4U) S44 H[  u  nnUR                  5       (       d  M  [        R                  " UR!                  UR#                  5       SSS5      UU   USSS9UU'   M]     U$ )a  Crops and resizes bounding box regions from the input image to the target output size.

Applies padding and aspect ratio correction to each crop before resizing.
Uses bilinear interpolation for downscaling and bicubic for upscaling.

This implementation is equivalent to the cv2 affine warp with rotation=0 used in the original
Sapiens2 codebase. Rotation is always zero because we don't support rotated bounding boxes.

Args:
    image (`torch.Tensor`): Input image tensor of shape `(C, H, W)` in float32.
    boxes (`torch.Tensor`): Bounding boxes in (center-x, center-y, width, height) format,
        shape `(num_boxes, 4)`, with values in absolute pixel coordinates.
    output_size (`tuple[int, int]`): Target output size as `(height, width)`.
    padding (`float`, *optional*, defaults to `1.25`): Multiplicative factor applied to the
        bounding box dimensions before cropping, adding context around the region of interest.

Returns:
    `torch.Tensor`: Cropped and resized images of shape `(num_boxes, C, output_height, output_width)`.
)r?   r@   r-   r         ?dtypedeviceij)indexingNr<   g       @r.   r   rU   rT   bilinearbicubiczerosT)modepadding_modealign_corners)shaperO   r1   r2   minimummeshgridarangefloat32rU   r3   emptyrT   	unsqueezeanyFgrid_sampleexpandsum)rP   r>   r?   r@   rK   rL   num_channelsinput_heightinput_widthrH   rN   rD   rE   boxes_widthboxes_heightscale_xscale_yis_bilineargrid_ygrid_xin_xin_ygrids	num_boxesoutputimage_4dmaskr\   s                               r*   crop_and_resizer|   n   s(   2 #.M.3kk+L(QXYMFr*H %R 0Ka;.Gq L0G--1C7K^^]%--M\u||LNFF
 GAtTM**Xatm-DDs[YZ\`bfYfMgGggDGAtTM**Xatm-DDs\Z[]acgZgMhGhhDKKt{Q7#=sTz\\]M]?^ad?dekmnEAI[[LVbVbjojujuvF q!H#Z0K<2KL
d88::==
BB7d$"F4L M Mr)   heatmapskernelc                    US-  S:X  d  US::  a  [        S5      eSUS-
  S-  S-
  -  S-   nUS-
  S-  nU R                  SS	9n[        R                  " XX3U4S
SS9n[        R
                  " XQU/X"/S9nUSS2X3* 2X3* 24   nUR                  SS	9n[        R                  " US:  U[        R                  " U5      5      n	[        R                  " US:  XI-  [        R                  " U5      5      n
XzSS2SS4   -  $ )a  Gaussian blur per-keypoint heatmap, preserving the original max value.

Matches cv2.GaussianBlur with sigma=0 which means that the sigma is automatically
computed from the kernel size.

Args:
    heatmaps: Shape `(K, height, width)`.
    kernel: Odd integer kernel size for the Gaussian blur. Must be greater than 1.

Returns:
    `torch.Tensor`: Blurred heatmaps of the same shape as the input.
   r   r   z2Kernel size must be an odd integer greater than 1.g333333?r<   g?)r   r   r.   constant        )r\   value)kernel_sizesigmaN)	
ValueErroramaxrg   padtvFgaussian_blurr2   rC   	ones_like)r}   r~   r   borderorigin_maxespaddedblurredresultresult_maxes
safe_maxesrN   s              r*   gaussian_blur_preserve_maxr      s    zQ&A+MNNFQJ#%)*S0EqjQF==V=,L UU8ff=JVYZFV4DUN[GQww67F;;6;*L\A-|U__\=Z[JKKq(,*CU__UaEbcE!T4-(((r)   c           
         U R                   u  pp4U R                  nU R                  XS5      nUR                  SS9nUR	                  SS9nX-  R                  5       n	X-  R                  5       n
[        R                  " UR                  S5      S:  [        R                  " X/SS9[        R                  " XS4SUS95      nX4$ )a+  Predict keypoint locations and confidence scores from heatmaps.

Args:
    heatmaps: Shape `(num_persons, num_keypoints, height, width)`.

Returns:
    locations: `(num_persons, num_keypoints, 2)` x/y in heatmap pixel coordinates.
    scores: `(num_persons, num_keypoints)` per-keypoint confidence.
r-   r.   r   r   g      rU   )r_   rU   reshaper   argmaxfloatr2   rC   re   r3   full)r}   num_personsnum_keypoints_heatmap_widthrU   heatmap_flatscores
flat_indexlocations_xlocations_y	locationss               r*   get_keypoint_predictionsr      s     4<>>0K__F##KCL2&F$$$,J-446K.557Ks"[.B7

K2DHI
 r)   	keypointsblur_kernel_sizec           	         UR                   u  p4pVUR                  n[        UR                  X4-  XV5      U5      R                  X4XV5      nUR	                  SS5      R                  5       n[        R                  " USSS9nUR                  5       n	US-   n
US-   nX-  nXL-  nU SS2SS2S4   R                  5       S	-   U SS2SS2S	4   R                  5       S	-   U-  -   nX[        R                  " XG[        R                  S
9SSS24   -  -   nX[        R                  " X7[        R                  S
9SS2S4   -  -   nUR                  S5      nSS	SUU* US	-   US	-   * S.nUR                  5        VVVs0 s H  u  u  nnnUU4XU-      _M     nnnnSUS   US   -
  -  nSUS   US   -
  -  nUS   SUS   -  -
  US   -   nUS   SUS   -  -
  US   -   nSUS   US   -
  US   -
  US   -   US   -   US   -
  US   -
  US   -   -  n[        R                  " UR                  5      R                   nUU-   nUU-   nUU-  UU-  -
  nUU-  UU-  -
  U-  nU* U-  UU-  -   U-  nU [        R"                  " UU/SS9-
  $ s  snnnf )a1  Sub-pixel refinement via Hessian on log-heatmaps (UDP Dark Pose).

Args:
    keypoints: Shape `(num_persons, num_keypoints, 2)` x/y in heatmap pixel coordinates.
    heatmaps: Shape `(num_persons, num_keypoints, height, width)`.

Returns:
    `(num_persons, num_keypoints, 2)` refined keypoint locations.
gMbP?g      I@)r   r   r   r   	replicate)r\   r   Nr   r   rX   r-   )r   r   r   r   r   r-   r   r   r-   r   r   r   r-   r-   r<   r   r   r   r   r   r   r   r.   )r_   rU   r   r   clamplogrg   r   flattenlongr2   rb   re   itemsfinforT   epscat)r   r}   r   r   r   heatmap_heightr   rU   heatmaps_paddedheatmaps_flattenedpadded_heightpadded_widthkeypoint_strideperson_strideindexposition_to_index_offsetdxdyoffsetheatmap_values
gradient_x
gradient_y
hessian_xx
hessian_yy
hessian_xyr   determinantoffset_xoffset_ys                                r*   "post_dark_unbiased_data_processingr      sN    AI=K__F)4nTVfgk.H  ~~dD)--/HeeHlEO(002"Q&M 1$L#2O!3MaAg##%)Yq!Qw-?-D-D-F-Jl,ZZEell=W\WaWa&bcgijcj&kkkEELLSXS]S]$^_`bf_f$gggEOOBE q  1$%  NfMkMkMmMm9I"b6R$V^44Mm   t,~e/DDEJt,~e/DDEJ%N4,@(@@>RWCXXJ%N4,@(@@>RWCXXJt

	

	 
	 
		
 
	  
	  
 	!	J ++j&&
'
+
+Cc!Jc!Jz)J,CCKZ'*z*AA[PHj(:
+BBkQHuyy(H!52>>>5s   5I?c            "         ^  \ rS rSrSr\r\R                  r	\
r\rSSS.rSrSSS.rSrSrSrSrSrSrS	\\   4U 4S
 jjr\  S9S\S\S-  S\\\\         S-  S	\\   S\4
U 4S jjj5       r S:S\S\S-  S\\\\         S-  S\S\ S\!\"-  S-  S\#\!S4   S-  S\4S jjr$S\S   S\S   4S jr%   S;S\\&RN                     S\S\(SSS\S\(S \S!\S"\S#\\\   -  S-  S$\\\   -  S-  S%\S-  S&\S'\S\\\\         S-  S\\&RN                     4 S( jjr)S:S)\\*   S-  4S* jjr+     S<S+\,S\\\\         S,\,S-  S-\-S.\S-  S/\"\\*\-\-4      -  S-  S)\"\\*\-\-4      -  S-  S\\\.\!\&RN                  4         4S0 jjr/   S=S+\0S/\"\\*\-\-4      -  S-  S)\"\\*\-\-4      -  S-  S1\S-  S\\.\!\&RN                  4      4
S2 jjr1   S=S+\2S/\"\\*\-\-4      -  S-  S)\"\\*\-\-4      -  S-  S1\S-  S\\.\!\&RN                  4      4
S3 jjr3  S9S+\4S)\"\\*\-\-4      -  S-  S4\S-  S\\.\!\&RN                  4      4S5 jjr5S6\&RN                  S/\"\\*\-\-4      -  S-  S)\"\\*\-\-4      -  S-  S1\S-  S\\&RN                     4
S7 jr6S8r7U =r8$ )>Sapiens2ImageProcessori,  z3PIL backend for Sapiens2 with reduce_label support.i   i   )rG   rF   T   Fkwargsc                 &   > [         TU ]  " S0 UD6  g )Nr    )super__init__)selfr   	__class__s     r*   r   Sapiens2ImageProcessor.__init__?  s    "6"r)   Nimagessegmentation_mapsr>   rA   c                 (   > [         TU ]  " XU40 UD6$ )a  
segmentation_maps (`ImageInput`, *optional*):
    The segmentation maps to preprocess.
boxes (`list[list[list[float]]]` or `np.ndarray`, *optional*):
    List or array of bounding boxes for each image. Each box should be a list of 4 floats
    representing the bounding box coordinates in COCO format
    (top_left_x, top_left_y, width, height). When provided, each person crop is
    affine-warped to the model input size instead of resizing the full image.
)r   
preprocess)r   r   r   r>   r   r   s        r*   r   !Sapiens2ImageProcessor.preprocessB  s    " w!&UMfMMr)   do_convert_rgbinput_data_formatreturn_tensorsrU   ztorch.devicec                    X8S'   U R                  XXWS9nUR                  5       n	SU	S'   0 n
U R                  " U40 U	D6U
S'   Ub  U R                  USS[        R                  S9nUR                  5       nUR                  SSS.5        U R                  " SS	U0UD6nU Vs/ s H1  nUR                  S
5      R                  [        R                  5      PM3     nnXS'   [        XS9$ s  snf )z"Handle extra inputs beyond images.r>   )r   r   r   rU   Fr   pixel_valuesr   )r   expected_ndimsr   r   )do_normalize
do_rescaler   r   labels)datatensor_typer    )_prepare_image_like_inputscopy_preprocessr   FIRSTupdatesqueezetor2   int64r   )r   r   r   r>   r   r   r   rU   r   images_kwargsr   processed_segmentation_mapssegmentation_maps_kwargsprocessed_segmentation_maps                 r*   _preprocess_image_like_inputs4Sapiens2ImageProcessor._preprocess_image_like_inputsU  s(     w00L] 1 
 ,1()#//H-H^ (*.*I*I( $"2"8"8	 +J +' (.{{}$$++URW,XY*.*:*: +2+6N+' 3N+2M. +221588E2M ( + 9NBB+s   !8C(r   ztorch.Tensorc           
      b   [        [        U5      5       H  nX   n[        R                  " US:H  [        R                  " SUR
                  UR                  S9U5      nUS-
  n[        R                  " US:H  [        R                  " SUR
                  UR                  S9U5      nX1U'   M     U$ )z/Reduce label values by 1, replacing 0 with 255.r      rS   r      )rangelenr2   rC   tensorrT   rU   )r   r   idxlabels       r*   reduce_label#Sapiens2ImageProcessor.reduce_label  s    V%CKEKK
ELLEKKX]XdXd,eglmEAIEKKell3ekkZ_ZfZf.ginoE3K & r)   	do_resizesizeresamplez7PILImageResampling | tvF.InterpolationMode | int | Nonedo_center_crop	crop_sizer   rescale_factorr   
image_mean	image_stddisable_groupingr   do_padc           	      X   Ub  US   US   4n/ n[        X5       H{  u  nn[        R                  " U[        R                  SS9n[        [        R                  " U[        R                  UR                  S95      nUR                  [        UUUS95        M}     UnSnU(       a  U R                  U5      n[        XS9u  nn0 nUR                  5        Ha  u  nnU(       aO  U(       a6  [        US   US   S9nU R                  UUU5      nU R                  UU5      nOU R                  UX45      nUUU'   Mc     [!        UU5      n[        UUS9u  nn0 nUR                  5        H8  u  nnU(       a  U R                  UU5      nU R#                  UXxXU5      nUUU'   M:     [!        UU5      $ )	z"Custom preprocessing for Sapiens2.rG   rF   FrT   rN   rS   )r>   r?   )r  )
max_height	max_width)zipr   to_dtype_imager2   rc   r=   r   rU   extendr|   r   r   r   r   resizecenter_cropr	   rescale_and_normalize)r   r   r   r  r  r  r  r   r  r   r  r  r  r   r	  r>   r   r?   cropsrP   image_boxesboxes_tensorgrouped_imagesgrouped_images_indexresized_images_groupedr_   stacked_imagesaspect_ratio_sizeresized_imagesprocessed_images_groupeds                                 r*   r   "Sapiens2ImageProcessor._preprocess  s   ( >4=9KE&)&&8"{**5US1%,,{RWR_R_hmhtht2uv_U,T_`a '9 FI&&v.F/DV/o,,!#%3%9%9%;!E>(0DNVZ[bVc(d%%)[[ART\%]N%)%5%5nd%KN%)[[%PN,:"5) &< ((>@TU/D^fv/w,,#% %3%9%9%;!E>!%!1!1.)!L!77
LV_N /=$U+ &< 68LMMr)   target_sizesc                 L   [        5       (       d  [        S5      eUR                  nUb  [        U5      [        U5      :w  a  [	        S5      e[        U[        R                  5      (       a  UR                  5       n/ n[        [        U5      5       HN  n[        R                  " X5   R                  SS9X%   SSS9nUS   R                  SS9nUR                  U5        MP     U$ UR                  SS9n[        UR                  S   5       Vs/ s H  oU   PM	     nnU$ s  snf )	a  
Converts the output of [`Sapiens2ForSemanticSegmentation`] into semantic segmentation maps.

Args:
    outputs ([`Sapiens2ForSemanticSegmentation`]):
        Raw outputs of the model.
    target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
        List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
        predictions will not be resized.

Returns:
    semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
    segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
    specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
z:PyTorch is required for post_process_semantic_segmentationzTMake sure that you pass in as many target sizes as the batch dimension of the logitsr   r.   rY   F)r  r\   r^   r   )r   ImportErrorlogitsr   r   
isinstancer2   Tensornumpyr   rg   interpolatere   r   appendr_   )	r   outputsr  r"  semantic_segmentationr   resized_logitssemantic_mapis	            r*   "post_process_semantic_segmentation9Sapiens2ImageProcessor.post_process_semantic_segmentation  s+     "##Z[[ #6{c,// j  ,55+113$&!S[)!"K))a)0|7Hzin"  .a077A7>%,,\: * %$ %+MMaM$8!GLMbMhMhijMkGl$mGl!1%=Gl!$m$$ %ns   D!r(  outputs_flippedr   	thresholdsource_sizesc                     [        U[        R                  5      (       a  UR                  5       n[        U[        R                  5      (       a  UR                  5       n[	        U5      nUb  Uc  [        S5      eUb  U[	        U5      :w  a  [        S5      eUb  U[	        U5      :w  a  [        S5      eUR                  n	Ub  XR                  -   S-  n	U	R                  n
U	R                  u  ppUS:X  a  U Vs/ s H  n/ PM     sn$ [        R                  " U VVs/ s H  nU  H  nUPM     M     snn[        R                  U
S9nU	R                  5       n	[        U	5      u  nn[        UXS9n[        [        U5      U R                   S	   U R                   S
   4S9u  nn[        R                  " US-
  US-
  /[        R                  U
S9nUU-  USS2SSS24   -  USS2SSS24   -   SUSS2SSS24   -  -
  n[#        U5      nUb  Ub  [        R                  " [%        Xg5       VVVVs/ s H  u  u  nnu  nnUU-  UU-  /PM     snnnn[        R                  U
S9n[        R&                  " [)        U5       Vs/ s H3  nUU   R+                  S5      R-                  [	        UU   5      S5      PM5     sn5      nUUSS2SSS24   -  nUUSS2/ SQ4   -  n/ n [)        U5       HQ  n!UU!   n"UU!   n#[        R.                  " XS9n$Ub  U#U:  n%U"U%   n"U#U%   n#U$U%   n$U R1                  U"U#U$UU!   S.5        MS     / n&Sn'U H*  n[	        U5      n(U&R1                  U U'U'U(-    5        U'U(-  n'M,     U&$ s  snf s  snnf s  snnnnf s  snf )a	  
Converts the output of [`Sapiens2ForPoseEstimation`] into keypoint predictions in image space.

Args:
    outputs (`Sapiens2PoseEstimatorOutput`):
        Raw outputs of the model. `outputs.heatmaps` must have shape
        `(N_total, num_keypoints, heatmap_height, heatmap_width)` where
        `N_total = sum(len(b) for b in boxes)`.
    boxes (`list[list[list[float]]]` or `np.ndarray`):
        List or array of bounding boxes for each image in absolute pixel coordinates. Each box
        should be a list of 4 floats representing the bounding box coordinates in COCO format
        (top_left_x, top_left_y, width, height). Must match the `boxes` argument passed to
        `preprocess`.
    outputs_flipped (`Sapiens2PoseEstimatorOutput`, *optional*):
        Outputs from running the model on horizontally flipped inputs. When provided, heatmaps
        are averaged with `outputs` before keypoint extraction to improve accuracy:
        `avg_heatmaps = (outputs.heatmaps + outputs_flipped.heatmaps) / 2`.
    kernel_size (`int`, *optional*, defaults to 11):
        Kernel size for the Gaussian blur used in UDP Dark Pose refinement.
    threshold (`float`, *optional*):
        Score threshold. Keypoints with scores at or below this value are
        filtered out from the result dictionaries.
    source_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Original `(height, width)` of each image in pixels. Required when `target_sizes` is
        provided, as the source coordinate space for scaling keypoints and bounding boxes.
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Desired output `(height, width)` coordinate space for each image. When provided
        alongside `source_sizes`, keypoint coordinates and bounding boxes are scaled from
        source to target space.

Returns:
    `list[list[dict]]`: Outer list is over images, inner list is over persons.
    Each dict contains:
    - `keypoints` (`torch.FloatTensor` of shape `(num_keypoints, 2)`): absolut x/y coordinates in
      the source image space, or in target space if `target_sizes` is provided.
    - `scores` (`torch.FloatTensor` of shape `(num_keypoints,)`): per-keypoint confidence.
    - `labels` (`torch.LongTensor` of shape `(num_keypoints,)`): keypoint indices.
    - `bbox` (`torch.FloatTensor` of shape `(4,)`): bounding box in absolute (x_min, y_min, x_max, y_max)
       format, in the same coordinate space as `keypoints`.
NzA`source_sizes` must be provided when `target_sizes` is specified.zHMake sure that you pass in as many source sizes as the number of images.zHMake sure that you pass in as many target sizes as the number of images.r   r   rS   )r   r}   r   rG   rF   )r?   r   r<   )r   r   r   r   r   )r   r   r   bbox)r#  r2   r$  tolistr   r   r}   rU   r_   r   rc   r   r   r   rO   r=   r  r:   r  r   r   re   ri   rb   r'  ))r   r(  r>   r/  r   r0  r1  r  
num_imagesr}   rU   num_total_personsr   r   r   r   r  boxr  all_keypoints
all_scorescentersscalesheatmap_size	all_boxessource_heightsource_widthtarget_heighttarget_widthper_image_scaleimage_indexper_person_scaleperson_resultsperson_indexr   r   r   keepr   person_offsetnum_persons_in_images)                                            r*   post_process_pose_estimation3Sapiens2ImageProcessor.post_process_pose_estimation  s   d lELL11'..0LlELL11'..0LZ
#(<`aa#
c,6G(Gghh#
c,6G(Gghh##& #;#;;q@HJR..G.! %&1B&& ||$)AE[[cS[SEA_e
 >># %=X$F!z:#h

 /|,499X;NPTPYPYZaPb:c
 ||]Q%68J$KSXS`S`iopL(6!T1*+==4QR
@SSVY\bcdfjlmcm\nVnn 	 %\2	#(@#ll Y\\hXwXwT57T| "L0--2OPXw mmO  %yy (-Z'8'8 $K0::1=DDS{I[E\^_`'8  *,<QaZ,HHM!$4Q_$EEI!"34L%l3I-F\\-?F$	)%dO	!!'6VU^_kUlm 5   K#&{#3 MM.I]9]^_11M ! I ' B0s   :M8M=
0N:Ndo_remove_paddingc                     [         R                  " UR                  SSSS9nU R                  XRX4S9nU Vs/ s H  nSU0PM	     sn$ s  snf )a  
Converts the output of [`Sapiens2ForNormalEstimation`] into L2-normalized surface normal maps.

Args:
    outputs (`Sapiens2NormalEstimatorOutput`):
        Raw outputs of the model.
    source_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Original `(height, width)` of each image before preprocessing. When provided,
        the padding added during preprocessing is removed and predictions are resized back
        to the original image size (unless `target_sizes` overrides the final size).
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Requested final `(height, width)` for each prediction. When provided, used as the
        resize target instead of `source_sizes`. Resized with bilinear interpolation after
        L2 normalization.
    do_remove_padding (`bool`, *optional*):
        Whether to crop away the zero-padding added during preprocessing before resizing.
        Defaults to `True` when `source_sizes` is provided, `False` otherwise.

Returns:
    `list[dict[str, torch.Tensor]]` of length `batch_size`. Each dict has a `"normals"` key
    mapping to a tensor of shape `(3, height, width)` with L2-normalized unit vectors in
    `[-1, 1]` per channel (XYZ surface normals).
r   r   g:0yE>)pr/   r   mapsr1  r  rL  normals)rg   	normalizerQ  _post_process_maps)r   r(  r1  r  rL  rQ  resultsr   s           r*   post_process_normal_estimation5Sapiens2ImageProcessor.post_process_normal_estimation  sU    < ++gootD)), * 
 3::'F#':::s   Ac                     UR                   nUR                  b  XQR                  SS2SS2SS4   -  nU R                  XRX4S9nU Vs/ s H  nSU0PM	     sn$ s  snf )a  
Converts the output of [`Sapiens2ForPointmapEstimation`] into pointmap tensors in image space.

Args:
    outputs (`Sapiens2PointmapEstimatorOutput`):
        Raw outputs of the model.
    source_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Original `(height, width)` of each image before preprocessing. When provided,
        the padding added during preprocessing is removed and predictions are resized back
        to the original image size (unless `target_sizes` overrides the final size).
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Requested final `(height, width)` for each prediction. Overrides `source_sizes`
        as the resize target.
    do_remove_padding (`bool`, *optional*):
        Whether to crop away the zero-padding added during preprocessing before resizing.
        Defaults to `True` when `source_sizes` is provided, `False` otherwise.

Returns:
    `list[dict[str, torch.Tensor]]` of length `batch_size`. Each dict has a `"pointmap"` key
    mapping to a tensor of shape `(3, height, width)` with per-pixel 3D XYZ coordinates in
    canonical camera space, optionally divided by `outputs.scales` to convert to metric coordinates.
NrO  pointmap)	pointmapsr;  rS  )r   r(  r1  r  rL  rY  rT  r   s           r*    post_process_pointmap_estimation7Sapiens2ImageProcessor.post_process_pointmap_estimation  sl    : %%	>>%!NN1at3C$DDI))L * 
 4;;7V$7;;;s   Abackgroundsc           	        ^^ [        T[        R                  5      (       a  TR                  5       mUR                  R
                  S   nUR                  R                  nUR                  R                  nTb  U[        T5      :w  a  [        S5      eTSL =(       d    [        U4S jT 5       5      n/ mUb~  [        U5      n[        U5      S:w  a  [        U5      U:w  a  [        S5      eU V	s/ s H<  n	[        R                  " [        R                  " U	5      USS9R                  U5      PM>     sn	mT(       + =(       d    [        U4S	 jT 5       5      n
[        R                   " UR                  UR"                  /SS
9nTb?  U(       a8  [%        TS   5      n[&        R(                  " UUSSSS9nUR+                  SS5      n/ nU(       a  U
(       a  USS2SS24   nUSS2SS24   nS/U-  nT(       a  [        R,                  " T5      nUR
                  SS UR
                  SS :w  a%  [&        R(                  " UUR
                  SS SSSS9nUSU-
  U-  -   R+                  SS5      n[        R                  " U[        R.                  SS9n[1        XU5       H  u  nnnUR3                  UUUS.5        M     U$ [5        [        U5      5       GH-  nUU   nT(       aF  U(       d?  [&        R(                  " UR7                  S5      TU   SSSS9S   nUR+                  SS5      nUSS nUSS nSnT(       a  [        T5      S:X  a  TS   OTU   nUR
                  SS UR
                  SS :w  a7  [&        R(                  " UR7                  S5      UR
                  SS SSSS9S   nUSU-
  U-  -   R+                  SS5      n[        R                  " U[        R.                  SS9nUR3                  UUUS.5        GM0     U$ s  sn	f )a`  
Converts the output of [`Sapiens2ForImageMatting`] into alpha mattes and foreground maps.

Args:
    outputs (`Sapiens2ImageMattingOutput`):
        Raw outputs of the model.
    target_sizes (`torch.Tensor` or `list[tuple[int, int]]` of length `batch_size`, *optional*):
        Requested final `(height, width)` for each prediction. Resized with bilinear
        interpolation. If unset, predictions are returned at the model output resolution.
    backgrounds (`ImageInput`, *optional*):
        Background image(s) to composite over. Can be a single image (applied to every item
        in the batch) or a list of images, one per batch item. Accepts PIL images, numpy
        arrays, or torch tensors of any dtype; integer types (e.g. uint8) are scaled to
        `[0, 1]` automatically. When provided, each result dict gains a `"composite"` key
        with the composited image as a uint8 tensor in `[0, 255]`.

Returns:
    `list[dict]` of length `batch_size`. Each dict has:
    - `"alpha"` (`torch.Tensor` of shape `(1, height, width)`): alpha values in `[0, 1]`.
    - `"foreground"` (`torch.Tensor` of shape `(3, height, width)`): pre-multiplied RGB in `[0, 1]`.
    - `"composite"` (`torch.Tensor` of shape `(3, height, width)` or `None`): foreground composited
      over `backgrounds` as a uint8 tensor in `[0, 255]`; `None` when `backgrounds` is not provided.
r   Nz\Make sure that you pass in as many target sizes as the batch dimension of the matting outputc              3   X   >#    U  H  n[        U5      [        TS    5      :H  v   M!     g7fr   N)tuple).0r  r  s     r*   	<genexpr>DSapiens2ImageProcessor.post_process_image_matting.<locals>.<genexpr>  s&      =
>JdE$K5a11ls   '*r   z[Make sure that you pass in as many backgrounds as the batch dimension of the matting outputTr  c              3   f   >#    U  H&  oR                   S S TS   R                   S S :H  v   M(     g7f)Nr   )r_   )ra  
backgroundbackground_tensorss     r*   rb  rc    s9      C
Xj*RS!%7%:%@%@%EEXjs   .1r.   rY   Fr  r\   r^   	antialiasr   rR   r   re  )
foregroundalpha	composite)r#  r2   r$  r4  foregroundsr_   rU   rT   r   r   allr   r   r  to_imager   r   alphasr`  rg   r&  r   r3   uint8r  r'  r   re   )r   r(  r  r\  
batch_sizerU   rT   all_target_sizes_equalbackground_listbackground_imageall_background_sizes_equalmattingtarget_sizer   rm  rp  
compositesrf  rj  rk  rl  r   matting_itemrg  s     `                    @r*   post_process_image_matting1Sapiens2ImageProcessor.post_process_image_matting  s4   : lELL11'..0L((..q1
$$++##))#S.. r  ".!5 "
 =
>J=
 :
  "1+>O?#q(S-AZ-O q 
 )8"(7$ ""3<<0@#AVZ[^^_ef(7" *<%; &
s C
XjC
 @
" ))W00'..AqI#(>Q0Kmm #G mmC-G!&@!!RaR%.KQU^F*,J!"[[);<
##BC(GMM"#,>>!""$]]23/'&+"'"J *QZ:,EELLSRUV
 //
%++UYZ
03K0T,
E9&0!&%. 1UV A s7|,&u~(>#$==$..q1)%0'&+"'$ $L $0#5#5c3#?L)"1-
$QR( 	%:=>P:QUV:V!3A!6\not\uJ!'',0B0B230GG%&]]&003!-!3!3BC!8!+*/&+& &
 ",q5yJ.F!F M McSV WI # 2 29EKKW[ \IZ%V_`a= -@ g"s   1AP
rP  c                   ^^ [        U[        R                  5      (       a  UR                  5       n[        U[        R                  5      (       a  UR                  5       nUc  US LnU(       a  Uc  [	        S5      eUb#  [        U5      [        U5      :w  a  [	        S5      eUb#  [        U5      [        U5      :w  a  [	        S5      eU R                  S   nU R                  S   n/ mU(       ac  U H]  u  px[        Xx4XV5      u  pX:  a  XY-
  S-  OSnX:  a  Xj-
  S-  OSnTR                  UUU[        X5      -   U[        X5      -   45        M_     T(       + =(       d    [        U4S jT 5       5      n/ mUb  U Vs/ s H  n[        U5      PM     snmOUb  U Vs/ s H  n[        U5      PM     snmT(       + =(       d    [        U4S	 jT 5       5      n/ nU(       aX  U(       aQ  U(       a  TS   u  nnnnUS S 2S S 2UU2UU24   nT(       a  [        R                  " UTS   S
SSS9n[        U5      nU$ [        [        U5      5       Hl  nUU   nU(       a  TU   u  nnnnUS S 2UU2UU24   nT(       a-  [        R                  " UR!                  S5      TU   S
SSS9S   nUR                  U5        Mn     U$ s  snf s  snf )Nz>`source_sizes` must be provided when `do_remove_padding=True`.zUMake sure that you pass in as many source sizes as the batch dimension of the outputszUMake sure that you pass in as many target sizes as the batch dimension of the outputsrG   rF   r   r   c              3   2   >#    U  H  oTS    :H  v   M     g7fr_  r    )ra  cropr  s     r*   rb  <Sapiens2ImageProcessor._post_process_maps.<locals>.<genexpr>}  s     *N58+;   c              3   2   >#    U  H  oTS    :H  v   M     g7fr_  r    )ra  r  final_sizess     r*   rb  r    s     6fZeRV{1~7MZer  rY   Frh  )r#  r2   r$  r4  r   r   r  r   r'  minrn  r`  rg   r&  listr   re   )r   rP  r1  r  rL  model_heightmodel_widthoriginal_heightoriginal_width
new_height	new_widthpad_toppad_leftall_crops_equalr  all_final_sizes_equalr   topleftbottomrightr   map_itemr  r  s                          @@r*   rS  )Sapiens2ImageProcessor._post_process_mapsU  s    lELL11'..0LlELL11'..0L$ ,D 8!5]^^#D	S5F(Ftuu#D	S5F(Ftuuyy*ii(3?/(K$5|)%
 ?I>W<4:]^=F=TK39Z[ #j"?? 3y#>>	 4@ $)Ns*N*N'N#3?@<45;<@K%3?@<45;<@K$/ f36fZe6f3f4 +08(T65Aq#f*d5j89}}$Q#"'# $ZF* % s4y);$/4U|,Cvu'3v:tEz(ABH }} **1-(/'&+"'   H h'! *$ U A@s   ?KKr    )NN)N)FFN)N   NNN)NNN)9r!   r"   r#   r$   r%   r   valid_kwargsr   BILINEARr  r
   r  r   r  r  default_to_squarer  r   r  r   r   r   r	  r   r   r   r   r  r   r   r   r&   r   strr   r   r   r   r2   r$  r   r   r`  r-  r   intdictrJ  r   rU  r   rZ  r   r{  rS  r(   __classcell__)r   s   @r*   r   r   ,  so   =/L!**H&J$IS)D-IINJLF#(D!E #  0404	NN &,N De%&-	N
 56N 
N N4 59,C,C &,,C De%&-	,C
 ,C ,,C j(4/,C c>)*T1,C 
,C\4#7 D<P 0 "'04!9NU\\"9N 9N 	9N
 L9N 9N 9N 9N 9N 9N DK'$.9N 4;&-9N +9N 9N 9N  De%&-!9N$ 
ell	%9Nv+%UVZHZ +%b ?C"&BFBFL,L De%&L 5t;	L
 L 4<L !4c3h#884?L !4c3h#884?L 
d4U\\)*+	,Lb CGBF)-";."; !4c3h#884?"; !4c3h#884?	";
  $;"; 
d3$%	&";N CGBF)-#<0#< !4c3h#884?#< !4c3h#884?	#<
  $;#< 
d3$%	&#<P CG)-	G+G !4c3h#884?G  $&	G
 
d3$%	&GRVllV !4c3h#884?V !4c3h#884?	V
  $;V 
ell	V Vr)   r   )g      ?)r  )3typingr   r2   torch.nn.functionalnnr   rg   torchvision.transforms.v2r   &transformers.image_processing_backendsr   image_processing_utilsr   image_transformsr   r	   image_utilsr
   r   r   r   r   r   r   r   processing_utilsr   r   utilsr   r   r   modeling_sapiens2r   r   r   r   r   r:   r=   r$  r`  r  r   rO   r|   r   r   r   r   __all__r    r)   r*   <module>r     s  &     7 E 2 E	 	 	 5 C C <u "" #<<#sCx# # 5<<%&	#T 	<<<<<<< sCx< 	<
 \\<~) )s )ELL )<u|| ellELL>X8Y 4 NPE?||E?',||E?GJE?
\\E?P ~	/ ~	 ~	B $
$r)   