
    3jؠ                     X   S SK Jr  S SKrS SKJr  S SKJrJr  S SKJ	r	J
r
Jr  SSKJr  SSKJr  SS	KJr  / S
Qr " S S\5      r " S S\5      r " S S\\5      r " S S\5      r " S S\\5      r " S S\5      r " S S\\5      r " S S\5      r " S S\\5      r " S S\5      rg)    )AnyN)Tensor)
functionalinit)	ParameterUninitializedBufferUninitializedParameter   )SyncBatchNorm)LazyModuleMixin)Module)BatchNorm1dLazyBatchNorm1dBatchNorm2dLazyBatchNorm2dBatchNorm3dLazyBatchNorm3dr   c                      ^  \ rS rSr% SrSr/ SQr\\S'   \	\S'   \	S-  \S'   \
\S	'   \
\S
'         SSS.S\S\	S\	S-  S	\
S
\
S\
SS4U 4S jjjjrSS jrSS jrS rS r  SU 4S jjrSrU =r$ )	_NormBase   z,Common base of _InstanceNorm and _BatchNorm.   )track_running_statsmomentumepsnum_featuresaffiner   r   Nr   r   r   Tbiasr   returnc                  > XgS.n	[         TU ]  5         Xl        X l        X0l        X@l        XPl        U R
                  (       ae  [        [        R                  " U40 U	D65      U l
        U(       a&  [        [        R                  " U40 U	D65      U l        O7U R                  SS 5        O$U R                  SS 5        U R                  SS 5        U R                  (       a  U R                  S[        R                  " U40 U	D65        U R                  S[        R                  " U40 U	D65        U   U   U R                  S[        R                   "  SS[        R"                  0U	R%                  5        V
Vs0 s H  u  pU
S:w  d  M  X_M     snn
D65        U   O6U R                  SS 5        U R                  SS 5        U R                  SS 5        U R'                  5         g s  snn
f )	Ndevicedtyper   weightrunning_meanrunning_varnum_batches_trackedr#   r   )super__init__r   r   r   r   r   r   torchemptyr$   r   register_parameterregister_bufferzerosonestensorlongitemsreset_parameters)selfr   r   r   r   r   r"   r#   r   factory_kwargskv	__class__s               T/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/nn/modules/batchnorm.pyr*   _NormBase.__init__&   s    %+;( #6 ;;#EKK$O$OPDK%ekk,&Q.&QR	''5##Hd3##FD1##  L KN K   uzz,I.I   %** )7(<(<(>O(>!w,tqt(>O	   6  5  !6= Ps   1GGc                     U R                   (       aP  U R                  R                  5         U R                  R	                  S5        U R
                  R                  5         g g )Nr
   )r   r%   zero_r&   fill_r'   r5   s    r:   reset_running_stats_NormBase.reset_running_stats[   sJ    ## ##%""1%$$**, $    c                     U R                  5         U R                  (       aO  [        R                  " U R                  5        U R
                  b!  [        R                  " U R
                  5        g g g N)r@   r   r   ones_r$   r   zeros_r?   s    r:   r4   _NormBase.reset_parametersc   sI      ";;JJt{{#yy$DII& % rB   c                     [         erD   )NotImplementedErrorr5   inputs     r:   _check_input_dim_NormBase._check_input_dimj   s    !!rB   c                 Z    SR                   " S0 U R                  DSU R                  S L0D6$ )Nz{{num_features}, eps={eps}, momentum={momentum}, affine={affine}, bias={use_bias}, track_running_stats={track_running_stats}use_bias )format__dict__r   r?   s    r:   
extra_repr_NormBase.extra_reprm   s:    IIOP --*.))4*?	
rB   c           	      t  > UR                  SS 5      nUb  US:  a  U R                  (       av  US-   n	X;  al  U R                  b:  U R                  R                  [        R                  " S5      :w  a  U R                  O"[        R
                  " S[        R                  S9X'   [        T
U ]!  UUUUUUU5        g )Nversionr   r'   metar   )r#   )	getr   r'   r"   r+   r1   r2   r)   _load_from_state_dict)r5   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsrV   num_batches_tracked_keyr9   s             r:   rY   _NormBase._load_from_state_dictu   s     !$$Y5Ow{0H0H '-/D&D#&8 //;00775<<;OO ,, auzz:	 3 	%	
rB   )r   r   r   r   r   r   r$   h㈵>皙?TTNNr   N)__name__
__module____qualname____firstlineno____doc___version__constants__int__annotations__floatboolr*   r@   r4   rL   rS   rY   __static_attributes____classcell__r9   s   @r:   r   r      s    6HXM	JdlL !$$(3  3 3  3  $,	3 
 3  "3  3  
3  3 j-'"
 
 
 
  
rB   r   c                   x   ^  \ rS rSr      SSS.S\S\S\S-  S\S	\S
\SS4U 4S jjjjrS\S\4S jr	Sr
U =r$ )
_BatchNorm   TNr   r   r   r   r   r   r   r   c                @   > XgS.n	[         T
U ]  " UUUUU40 U	DSU0D6  g Nr!   r   )r)   r*   )r5   r   r   r   r   r   r"   r#   r   r6   r9   s             r:   r*   _BatchNorm.__init__   s=     %+;	
 	
 	
rB   rK   c           
         U R                  U5        U R                  c  SnOU R                  nU R                  (       ak  U R                  (       aZ  U R                  bM  U R                  R                  S5        U R                  c  S[        U R                  5      -  nOU R                  n U R                  (       a  SnO#U R                  S L =(       a    U R                  S L n [        R                  " UU R                  (       a  U R                  (       a  U R                  OS U R                  (       a  U R                  (       a  U R                  OS U R                  U R                  UUU R                  5      $ )N        r
         ?T)rL   r   trainingr   r'   add_rp   r%   r&   F
batch_normr$   r   r   )r5   rK   exponential_average_factorbn_trainings       r:   forward_BatchNorm.forward   s'   e$
 == ),&)-&==T55''3((--a0==(14uT=U=U7V1V.15.	 ==K,,4T4;K;Kt;SK	
 || }}(@(@ !!$(MMT5M5MDSWKKII&HH
 	
rB   rP   rc   )rg   rh   ri   rj   rn   rp   rq   r*   r   r   rr   rs   rt   s   @r:   rv   rv      s     !$$(
 

 
 $,	

 
 "
 
 

 
.0
V 0
 0
 0
rB   rv   c                   v   ^  \ rS rSr% \\S'   \\S'         S
SS. SU 4S jjjjrSU 4S jjrSS jrS	r	U =r
$ )_LazyNormBase   r$   r   Tr   c          
        > XVS.n[         TU ]  " SUUSS40 UDSS0D6  X0l        X@l        U R                  (       a'  [	        S0 UD6U l        U(       a  [	        S0 UD6U l        U R                  (       ax  [        S0 UD6U l        [        S0 UD6U l	        [        R                  "  SS[        R                  0UR                  5        V	V
s0 s H  u  pU	S:w  d  M  X_M     sn
n	D6U l        g g s  sn
n	f )Nr!   r   Fr   r#   rP   r(   )r)   r*   r   r   r	   r$   r   r   r%   r&   r+   r1   r2   r3   r'   )r5   r   r   r   r   r"   r#   r   r6   r7   r8   r9   s              r:   r*   _LazyNormBase.__init__   s     %+; 
	
 
	
 
	
 #6 ;;0B>BDK2D^D	## 3 En ED2D^DD',||(jj( %3$8$8$:K$:DAa7l414$:K	(D$ $ Ls   
C-C-c                 p   > U R                  5       (       d   U R                  S:w  a  [        TU ]  5         g g g )Nr   )has_uninitialized_paramsr   r)   r4   )r5   r9   s    r:   r4   _LazyNormBase.reset_parameters  s3    ,,..43D3D3IG$& 4J.rB   c                    U R                  5       (       Ga@  UR                  S   U l        U R                  (       a  [	        U R
                  [        5      (       d  [        S5      eU R
                  R                  U R                  45        U R                  bP  [	        U R                  [        5      (       d  [        S5      eU R                  R                  U R                  45        U R                  (       aL  U R                  R                  U R                  45        U R                  R                  U R                  45        U R                  5         g g )Nr
   z-self.weight must be an UninitializedParameterz+self.bias must be an UninitializedParameter)r   shaper   r   
isinstancer$   r	   AssertionErrormaterializer   r   r%   r&   r4   rJ   s     r:   initialize_parameters#_LazyNormBase.initialize_parameters  s   ((** %AD{{!$++/EFF(G  ''):):(<=99(%dii1GHH,I  II))4+<+<*>?''!!--&&(   ,,&&( !!#+ +rB   )r   r   r'   r   r%   r&   r   r$   rc   rf   )rg   rh   ri   rj   r	   ro   r*   r4   r   rr   rs   rt   s   @r:   r   r      sQ    ""
    * * 
* *X'
$ $rB   r   c                   "    \ rS rSrSrSS jrSrg)r   i2  a  Applies Batch Normalization over a 2D or 3D input.

Method described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the number of features or channels of the input). By default, the
elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
At train time in the forward pass, the variance is calculated via the biased estimator,
equivalent to ``torch.var(input, correction=0)``. However, the value stored in the
moving average of the variance is calculated via the unbiased  estimator, equivalent to
``torch.var(input, correction=1)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

Args:
    num_features: number of features or channels :math:`C` of the input
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
    bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
        :attr:`affine` is ``True``). Default: ``True``

Shape:
    - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
      :math:`C` is the number of features or channels, and :math:`L` is the sequence length
    - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm1d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm1d(100, affine=False)
    >>> input = torch.randn(20, 100)
    >>> output = m(input)
Nc                     UR                  5       S:w  a2  UR                  5       S:w  a  [        SUR                  5        S35      eg g Nr      zexpected 2D or 3D input (got D input)dim
ValueErrorrJ   s     r:   rL   BatchNorm1d._check_input_dim{  @    99;!		q 0<UYY[MRSS !1rB   rP   rf   rg   rh   ri   rj   rk   rL   rr   rP   rB   r:   r   r   2  s    FPTrB   r   c                   &    \ rS rSrSr\rSS jrSrg)r   i  a  A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
from the ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
    bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
        :attr:`affine` is ``True``). Default: ``True``
Nc                     UR                  5       S:w  a2  UR                  5       S:w  a  [        SUR                  5        S35      eg g r   r   rJ   s     r:   rL    LazyBatchNorm1d._check_input_dim  r   rB   rP   rf   )	rg   rh   ri   rj   rk   r   cls_to_becomerL   rr   rP   rB   r:   r   r     s    8  MTrB   r   c                   "    \ rS rSrSrSS jrSrg)r   i  aG  Applies Batch Normalization over a 4D input.

4D is a mini-batch of 2D inputs
with additional channel dimension. Method described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
standard-deviation is calculated via the biased estimator, equivalent to
``torch.var(input, correction=0)``. However, the value stored in the moving average of the
standard-deviation is calculated via the unbiased  estimator, equivalent to
``torch.var(input, correction=1)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, H, W)`
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
    bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
        :attr:`affine` is ``True``). Default: ``True``

Shape:
    - Input: :math:`(N, C, H, W)`
    - Output: :math:`(N, C, H, W)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm2d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm2d(100, affine=False)
    >>> input = torch.randn(20, 100, 35, 45)
    >>> output = m(input)
Nc                 f    UR                  5       S:w  a  [        SUR                  5        S35      eg N   zexpected 4D input (got r   r   rJ   s     r:   rL   BatchNorm2d._check_input_dim  0    99;!6uyy{m8LMM rB   rP   rf   r   rP   rB   r:   r   r         GRNrB   r   c                   &    \ rS rSrSr\rSS jrSrg)r   i  a  A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
from the ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
    bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
        :attr:`affine` is ``True``). Default: ``True``
Nc                 f    UR                  5       S:w  a  [        SUR                  5        S35      eg r   r   rJ   s     r:   rL    LazyBatchNorm2d._check_input_dim  r   rB   rP   rf   )	rg   rh   ri   rj   rk   r   r   rL   rr   rP   rB   r:   r   r         8  MNrB   r   c                   "    \ rS rSrSrSS jrSrg)r   i  a|  Applies Batch Normalization over a 5D input.

5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
standard-deviation is calculated via the biased estimator, equivalent to
``torch.var(input, correction=0)``. However, the value stored in the moving average of the
standard-deviation is calculated via the unbiased  estimator, equivalent to
``torch.var(input, correction=1)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
or Spatio-temporal Batch Normalization.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, D, H, W)`
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
    bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
        :attr:`affine` is ``True``). Default: ``True``

Shape:
    - Input: :math:`(N, C, D, H, W)`
    - Output: :math:`(N, C, D, H, W)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm3d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm3d(100, affine=False)
    >>> input = torch.randn(20, 100, 35, 45, 10)
    >>> output = m(input)
Nc                 f    UR                  5       S:w  a  [        SUR                  5        S35      eg N   zexpected 5D input (got r   r   rJ   s     r:   rL   BatchNorm3d._check_input_dima  r   rB   rP   rf   r   rP   rB   r:   r   r     r   rB   r   c                   &    \ rS rSrSr\rSS jrSrg)r   if  a  A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
from the ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
    bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
        :attr:`affine` is ``True``). Default: ``True``
Nc                 f    UR                  5       S:w  a  [        SUR                  5        S35      eg r   r   rJ   s     r:   rL    LazyBatchNorm3d._check_input_dim  r   rB   rP   rf   )	rg   rh   ri   rj   rk   r   r   rL   rr   rP   rB   r:   r   r   f  r   rB   r   c                      ^  \ rS rSrSr       SSS.S\S\S\S-  S	\S
\S\S-  S\SS4U 4S jjjjr	SS jr
SS jrS\S\4S jr\SS j5       rSrU =r$ )r   i  aN  Applies Batch Normalization over a N-Dimensional input.

The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over all
mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
are learnable parameter vectors of size `C` (where `C` is the input size).
By default, the elements of :math:`\gamma` are sampled from
:math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
The standard-deviation is calculated via the biased estimator, equivalent to
`torch.var(input, correction=0)`.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
Normalization or Spatio-temporal Batch Normalization.

Currently :class:`SyncBatchNorm` only supports
:class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
:meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
:attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
Network with DDP.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, +)`
    eps: a value added to the denominator for numerical stability.
        Default: ``1e-5``
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
    process_group: synchronization of stats happen within each process group
        individually. Default behavior is synchronization across the whole
        world
    bias: If set to ``False``, the layer will not learn an additive bias (only relevant if
        :attr:`affine` is ``True``). Default: ``True``

Shape:
    - Input: :math:`(N, C, +)`
    - Output: :math:`(N, C, +)` (same shape as input)

.. note::
    Synchronization of batchnorm statistics occurs only while training, i.e.
    synchronization is disabled when ``model.eval()`` is set or if
    ``self.training`` is otherwise ``False``.

Examples::

    >>> # xdoctest: +SKIP
    >>> # With Learnable Parameters
    >>> m = nn.SyncBatchNorm(100)
    >>> # creating process group (optional)
    >>> # ranks is a list of int identifying rank ids.
    >>> ranks = list(range(8))
    >>> r1, r2 = ranks[:4], ranks[4:]
    >>> # Note: every rank calls into new_group for every
    >>> # process group created, even if that rank is not
    >>> # part of the group.
    >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
    >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
    >>> input = torch.randn(20, 100, 35, 45, 10)
    >>> output = m(input)

    >>> # network is nn.BatchNorm layer
    >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
    >>> # only single gpu per process is currently supported
    >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
    >>>                         sync_bn_network,
    >>>                         device_ids=[args.local_rank],
    >>>                         output_device=args.local_rank)
TNr   r   r   r   r   r   process_groupr   r   c	                L   > XxS.n
[         TU ]  " UUUUU40 U
DSU	0D6  X`l        g ry   )r)   r*   r   )r5   r   r   r   r   r   r   r"   r#   r   r6   r9   s              r:   r*   SyncBatchNorm.__init__  sE     %+;	
 	
 	
 +rB   c                 f    UR                  5       S:  a  [        SUR                  5        S35      eg )Nr   z expected at least 2D input (got r   r   rJ   s     r:   rL   SyncBatchNorm._check_input_dim  s/    99;??		}HUVV rB   c                 D    UR                  S5      S:X  a  [        S5      eg )Nr
   r   z9SyncBatchNorm number of input channels should be non-zero)sizer   rJ   s     r:   _check_non_zero_input_channels,SyncBatchNorm._check_non_zero_input_channels  s'    ::a=AK  rB   rK   c                 F   U R                  U5        U R                  U5        U R                  c  SnOU R                  nU R                  (       a{  U R                  (       aj  U R
                  c  [        S5      eU R
                  R                  S5        U R                  c  SU R
                  R                  5       -  nOU R                  n U R                  (       a  SnO#U R                  SL =(       a    U R                  SL n U R                  (       a  U R                  (       a  U R                  OSnU R                  (       a  U R                  (       a  U R                  OSnU=(       aV    U R                  =(       aC    [        R                  R                  5       =(       a    [        R                  R                  5       nU(       a  UR                  R                   SSS	[        R"                  R%                  5       4;  a*  ['        S
[        R"                  R%                  5        35      e[        R                  R(                  R*                  nU R,                  (       a  U R,                  n[        R                  R/                  U5      nUS:  nU(       d;  [0        R2                  " UUUU R4                  U R6                  UUU R8                  5      $ U(       d  [        S5      e[:        R<                  " UU R4                  U R6                  UUU R8                  UWW5	      $ )z
Runs the forward pass.
Nr|   z$num_batches_tracked must not be Noner
   r}   Tcudahpuxpuz;SyncBatchNorm expected input tensor to be on GPU or XPU or zbn_training must be True)rL   r   r   r~   r   r'   r   r   itemr%   r&   r+   distributedis_availableis_initializedr"   type_C_get_privateuse1_backend_namer   groupWORLDr   get_world_sizer   r   r$   r   r   sync_batch_normapply)	r5   rK   r   r   r%   r&   	need_syncr   
world_sizes	            r:   r   SyncBatchNorm.forward  s    	e$++E2
 == ),&)-&==T55''/$%KLL$$))!,}}$-043K3K3P3P3R-R*-1]]*	 ==K,,4T4;K;Kt;SK	 &*]]d6N6NDTX 	 %)MMT5M5MDSW 	  33!!..03 !!002	 	 ||  668	)  !Qxx==?@B 
 "--3399M!! $ 2 2**99-HJ"QI <<		*	 	 $%?@@"((		*
 
rB   c           
      L   Un[        U[        R                  R                  R                  R
                  5      (       Ga  [        R                  R                  UR                  UR                  UR                  UR                  UR                  UUR                  SLS9nUR                  (       a@  [        R                  " 5          UR                  Ul        UR                  Ul        SSS5        UR                  Ul        UR                   Ul        UR"                  Ul        UR$                  Ul        ['        US5      (       a  UR(                  Ul        UR+                  5        H%  u  pEUR-                  X@R/                  XR5      5        M'     AU$ ! , (       d  f       N= f)a  Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

Args:
    module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
    process_group (optional): process group to scope synchronization,
        default is the whole world

Returns:
    The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
    layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
    a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
    instead.

Example::

    >>> # Network with nn.BatchNorm layer
    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> module = torch.nn.Sequential(
    >>>            torch.nn.Linear(20, 100),
    >>>            torch.nn.BatchNorm1d(100),
    >>>          ).cuda()
    >>> # creating process group (optional)
    >>> # ranks is a list of int identifying rank ids.
    >>> ranks = list(range(8))
    >>> r1, r2 = ranks[:4], ranks[4:]
    >>> # Note: every rank calls into new_group for every
    >>> # process group created, even if that rank is not
    >>> # part of the group.
    >>> # xdoctest: +SKIP("distributed")
    >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
    >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
    >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

Nr   qconfig)r   r+   nnmodules	batchnormrv   r   r   r   r   r   r   r   no_gradr$   r%   r&   r'   r~   hasattrr   named_children
add_moduleconvert_sync_batchnorm)clsmoduler   module_outputnamechilds         r:   r   $SyncBatchNorm.convert_sync_batchnormy  s@   H fehh..88CCDD!HH22##

**[[, 3 M }}]]_+1==M()/M& % *0)<)<M&(.(:(:M%060J0JM-%+__M"vy))(.%!002KD$$00F 3  %_s   #F
F#)r   )rd   re   TTNNNrf   rD   )rg   rh   ri   rj   rk   rn   rp   rq   r   r*   rL   r   r   r   classmethodr   rr   rs   rt   s   @r:   r   r     s    fV !$$($(+ ++ + $,	+
 + "+ Tz+ + 
+ +2WaV a aF = =rB   r   )typingr   r+   r   torch.nnr   r   r   torch.nn.parameterr   r   r	   
_functionsr   r   lazyr   r   r   __all__r   rv   r   r   r   r   r   r   r   rP   rB   r:   <module>r      s       * U U 8 ! |
 |
~H
 H
VL$OY L$^KT* KT\!TmZ !THLN* LN^!NmZ !NHLN* LN^!NmZ !NHmJ mrB   