
    3j4                         S SK Jr  SSKJr  SSKJr  SSKJr  \" SS9\ " S S	\5      5       5       r\" S
S9\ " S S\\5      5       5       r	SS	/r
g)    )strict   )BackboneConfigMixin)PreTrainedConfig)auto_docstringzfacebook/sapiens2-seg-0.4b)
checkpointc                     ^  \ rS rSr% SrSrSrSr\\	   S-  \
S'   Sr\\	   S-  \
S'   Sr\	\
S	'   Sr\S-  \
S
'   Sr\\	   S-  \
S'   Sr\\	   S-  \
S'   Sr\	\
S'   Sr\\	   S-  \
S'   Sr\\	   S-  \
S'   Sr\	\
S'   Sr\	S-  \
S'   Sr\\	   S-  \
S'   U 4S jrS\	\\	   -  \\	\	4   -  S\	\\	   -  \\	\	4   -  SS4S jrSrU =r$ )Sapiens2HeadConfig   a  
upsample_out_channels (`list[int]`, *optional*):
    Output channel counts for each upsample block.
    The first block takes `hidden_size` channels as input; subsequent blocks use the previous output.
upsample_kernel_sizes (`list[int]`, *optional*):
    Kernel size for each upsample block. Auto-filled with `[4, ...]` when
    `upsample_out_channels` is set but this is `None`.
    Must have the same length as `upsample_out_channels`.
upsample_kernel_size (`int`, defaults to 4):
    Default kernel size for upsample blocks when `upsample_kernel_sizes` is not set.
use_pixel_shuffle (`bool`, *optional*):
    Whether the upsample head uses pixel-shuffle upsampling instead of transposed convolutions.
    When `None` (default), the head uses transposed convolutions.
conv_out_channels (`list[int]`, *optional*):
    Output channel counts for the refinement conv layers that follow the upsample blocks.
conv_kernel_sizes (`list[int]`, *optional*):
    Kernel size for each refinement conv layer. Auto-filled with `[1, ...]` when
    `conv_out_channels` is set but this is `None`.
    Must have the same length as `conv_out_channels`.
conv_kernel_size (`int`, defaults to 1):
    Default kernel size for conv layers when `conv_kernel_sizes` is not set.
scale_conv_out_channels (`list[int]`, *optional*):
    Output channel counts for the stride-2 conv layers used to predict the focal-length scale.
    When `None` (default), no scale branch is built.
scale_conv_kernel_sizes (`list[int]`, *optional*):
    Kernel size for each scale conv layer. Auto-filled with `[1, ...]` when
    `scale_conv_out_channels` is set but this is `None`.
    Must have the same length as `scale_conv_out_channels`.
scale_conv_kernel_size (`int`, defaults to 1):
    Default kernel size for scale conv layers when `scale_conv_kernel_sizes` is not set.
scale_final_input_size (`int`, *optional*):
    Flattened feature size passed into the scale MLP.
    When `None` (default), it is automatically inferred from `image_size` and `patch_size`
    in the parent [`Sapiens2Config`].
scale_final_hidden_sizes (`list[int]`, *optional*):
    Hidden-layer sizes for the MLP that maps flattened scale features to the scalar scale output.
    When `None` (default), no scale branch is built.
sapiens2_headhead_configNupsample_out_channelsupsample_kernel_sizes   upsample_kernel_sizeuse_pixel_shuffleconv_out_channelsconv_kernel_sizes   conv_kernel_sizescale_conv_out_channelsscale_conv_kernel_sizesscale_conv_kernel_sizescale_final_input_sizescale_final_hidden_sizesc                   > U R                   b5  U R                  c(  U R                  /[        U R                   5      -  U l        U R                  b5  U R
                  c(  U R                  /[        U R                  5      -  U l        U R                  b5  U R                  c(  U R                  /[        U R                  5      -  U l        [        TU ],  " S0 UD6  g )N )r   r   r   lenr   r   r   r   r   r   super__post_init__)selfkwargs	__class__s     m/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/sapiens2/configuration_sapiens2.pyr     Sapiens2HeadConfig.__post_init__T   s    %%1d6P6P6X*.*C*C)Ds4KeKeGf)fD&!!-$2H2H2P&*&;&;%<s4CYCY?Z%ZD"''38T8T8\,0,G,G+H3tOkOkKl+lD(''    
image_size
patch_sizereturnc                    U R                   c  U R                  b  U R                  c  g [        U[        [
        45      (       a  UOX4u  p4[        U[        5      (       a  UOUS   n[        U[        5      (       a  UOUS   nX5-  nXF-  nU R                   H-  n	U	S-
  S-  n
USU
-  -   U	-
  S-  S-   nUSU
-  -   U	-
  S-  S-   nM/     Xx-  U R                  S   -  U l         g )Nr   r      )r   r   r   
isinstancelisttupleint)r!   r'   r(   image_heightimage_widthpatch_heightpatch_widthfeatures_heightfeatures_widthkernel_sizepaddings              r$   _init_scale_final_input_size/Sapiens2HeadConfig._init_scale_final_input_size]   s    ''3++3++32<Z$PU2W2WJ^h]u!%/
C%@%@zjQRm$.z3$?$?jZPQ]&6$377K"Q1,G.W<{JqPSTTO,q7{:[HQNQRRN 8 '6&FIeIefhIi&i#r&   )r   r   r   r   )__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyr   r.   r0   __annotations__r   r   r   boolr   r   r   r   r   r   r   r   r    r/   r9   __static_attributes____classcell__r#   s   @r$   r
   r
      s<   %N !J#O.249t+2.249t+2 !#!%)td{)*.tCy4'.*.tCy4'.c04T#Y-404T#Y-4"#C#)-C$J-15d3i$.5(jS	/E#s(O;jILtTWy[`adfiai[jIjj	j jr&   r
   zfacebook/sapiens2-pretrain-0.4bc                     ^  \ rS rSr% SrSrSr\\\   -  \	\\4   -  \
S'   Sr\\
S'   Sr\\
S	'   S
r\\
S'   Sr\\
S'   Sr\\
S'   Sr\\-  \
S'   Sr\\
S'   Sr\\
S'   Sr\\\   -  \	\\4   -  \
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S'   Sr\\
S '   Sr\\-  \
S!'   Sr\\
S"'   S#r \\
S$'   S%r!\S%-  \
S&'   S%r"\S%-  \
S''   S(r#\S%-  \
S)'   S%r$\\   S%-  \
S*'   S%r%\\   S%-  \
S+'   Sr&\\
S,'   S-\'0r(S.r)\\
S/'   S0r*\\
S1'   Sr+\\
S2'   Sr,\\
S3'   S%r-\\   S%-  \
S4'   S#r.\\
S5'   S#r/\\
S6'   S#r0\\
S7'   S8r1\\
S9'   S%r2\\\      S%-  \
S:'   S%r3\'\4-  S%-  \
S-'   U 4S; jr5S<r6U =r7$ )=Sapiens2Configr   uR  
rope_theta (`float`, *optional*, defaults to 100.0):
    The base period of the RoPE embeddings.
query_bias (`bool`, *optional*, defaults to `True`):
    Whether to add a bias to the query projection.
key_bias (`bool`, *optional*, defaults to `False`):
    Whether to add a bias to the key projection.
value_bias (`bool`, *optional*, defaults to `True`):
    Whether to add a bias to the value projection.
proj_bias (`bool`, *optional*, defaults to `True`):
    Whether to add a bias to the output projection.
layerscale_value (`float`, *optional*, defaults to 1.0):
    Initial value to use for layer scale.
use_gated_mlp (`bool`, *optional*, defaults to `False`):
    Whether to use the SwiGLU feedforward neural network.
num_register_tokens (`int`, *optional*, defaults to 0):
    The number of register tokens.
pos_embed_shift (`float`, *optional*):
    Amount to randomly shift position embedding coordinates in [-shift, shift],
    applied only in training mode if not `None`.
pos_embed_jitter (`float`, *optional*):
    Amount to randomly jitter position embedding coordinates in log-uniform value in [1/jitter, jitter],
    applied only in training mode if not `None`.
pos_embed_rescale (`float`, *optional*, defaults to 2.0):
    Amount to randomly rescale position embedding coordinates in log-uniform value in [1/rescale, rescale],
    applied only in training mode if not `None`.
reshape_hidden_states (`bool`, *optional*, defaults to `True`):
    Whether to reshape the hidden states to spatial dimensions when used as backbone.
use_mask_token (`bool`, *optional*, defaults to `False`):
    Whether to use a mask token in the embeddings (needed for masked image modeling pretraining).
rms_norm_eps (`float`, *optional*, defaults to 1e-6):
    Epsilon for the RMS normalization layers.
normalize_backbone_outputs (`bool`, *optional*, defaults to `True`):
    Whether to apply RMSNorm to the backbone `feature_maps` and `cls_tokens` outputs before
    returning them from the forward pass. Only applies when the model is used as a backbone.
use_qk_norm (`bool`, *optional*, defaults to `True`):
    Whether to apply RMSNorm to queries and keys before RoPE in attention layers.
num_key_value_heads_per_layer (`list[int]`, *optional*):
    Number of key/value heads for each transformer layer. Setting a layer's value equal to
    `num_attention_heads` gives full multi-head attention; a smaller value gives grouped-query
    attention. Defaults to `num_attention_heads` for the first `num_first_full_attention_layers`
    and last `num_last_full_attention_layers` layers and `num_key_valueattention_heads` for all other
    layers.
num_key_value_attention_heads (`int`):
    Number of key/value heads for layers that use grouped-query attention when `num_key_value_heads_per_layer`
    is not set. Ignored when `num_key_value_heads_per_layer` is set.
num_first_full_attention_layers (`int`, *optional*, defaults to 8):
    Number of leading transformer layers that use full multi-head attention.
    Only used when `num_key_value_heads_per_layer` is `None`.
num_last_full_attention_layers (`int`, *optional*, defaults to 8):
    Number of trailing transformer layers that use full multi-head attention.
    Only used when `num_key_value_heads_per_layer` is `None`.
semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
    Label index ignored when computing the segmentation loss.
flip_pairs (`list[list[int]]`, *optional*):
    Pairs of keypoint indices that are mirrored horizontally (e.g., left ear ↔ right ear).
    Each pair is a two-element list `[left_index, right_index]`. Used for test-time
    horizontal flip augmentation in pose estimation: pass these pairs to the second
    forward call so the model flips heatmaps back before returning them.
head_config (`Sapiens2HeadConfig`, *optional*):
    Configuration for the decode head. See [`Sapiens2HeadConfig`] for the available options.
sapiens2   r(   i   hidden_sizei   intermediate_size   num_hidden_layersnum_attention_headssilu
hidden_actg        attention_dropoutg{Gz?initializer_rangeg      Y@
rope_theta   r'   r   num_channelsT
query_biaskey_bias
value_bias	proj_biasmlp_biasg      ?layerscale_valuedrop_path_rateuse_gated_mlp   num_register_tokensNpos_embed_shiftpos_embed_jitterg       @pos_embed_rescale_out_features_out_indicesreshape_hidden_statesr   Fuse_mask_tokengư>rms_norm_epsnormalize_backbone_outputsuse_qk_normnum_key_value_heads_per_layernum_key_value_attention_headsnum_first_full_attention_layersnum_last_full_attention_layers   semantic_loss_ignore_index
flip_pairsc                   > U R                   cl  [        U R                  5       Vs/ s HF  nX R                  :  d  X R                  U R                  -
  :  a  U R
                  OU R                  PMH     snU l         [        U R                  [        5      (       a  [        S0 U R                  D6U l        U R                  b.  U R                  R                  U R                  U R                  S9  S/[        SU R                  S-   5       Vs/ s H  nSU 3PM
     sn-   U l        U R                  UR!                  SS 5      UR!                  SS 5      S9  ["        TU ]H  " S0 UD6  g s  snf s  snf )	N)r'   r(   stemr   stageout_indicesout_features)rv   rw   r   )rl   rangerO   rn   ro   rP   rm   r-   r   dictr
   r9   r'   r(   stage_names"set_output_features_output_indicespopr   r    )r!   r"   layer_indexir#   s       r$   r    Sapiens2Config.__post_init__   s^   --5 $))?)?#@2 $AK	  "F"FF"&<&<t?b?b&bb ((
 778 $A2D. d&&--1ED4D4DED'99T__aeapap9q"8E!TE[E[^_E_<`&a<`qqc{<`&aa//

=$7fjjQ_aeFf 	0 	
 	''#2 'bs   AE&E+)r   rl   rz   )8r;   r<   r=   r>   r?   r@   r(   r0   r.   r/   rB   rL   rM   rO   rP   rR   strrS   floatrT   rU   r'   rW   rX   rC   rY   rZ   r[   r\   r]   r^   r_   ra   rb   rc   rd   re   rf   rg   r
   sub_configsrh   ri   rj   rk   rl   rm   rn   ro   rq   rr   r   ry   r    rD   rE   rF   s   @r$   rH   rH   r   s   =~ J46Jd3i%S/16K!s!s!!J%(us{(#u#J47Jd3i%S/17L#JHdJItHd!e!"%NECK%M4  $(OUT\(%)edl)&)ut|)&*M49t#*%)L$s)d")"&4& "45K ND L%'++K6:!49t#3:)*!3*+,#S,*+"C+&)))-JT#Y$&-48K#d*T18( (r&   rH   N)huggingface_hub.dataclassesr   backbone_utilsr   configuration_utilsr   utilsr   r
   rH   __all__r   r&   r$   <module>r      s   & / 1 3 # 78Sj) Sj  9Sjl <=}((*: }(  >}(@ 1
2r&   