
    3j              	       R   S SK r S SKJr  S SKJr  S SKJr  S SKrS SKJ	r	J
r
  S SKJrJrJrJrJrJrJrJrJrJrJrJrJr  / SQr\" SS	5      r " S
 S\R6                  5      rS\R:                  4S\R6                  S\S\S\4S jjr  " S S\5      r! " S S\5      r"Sq#S r$Sq%S r&g)    N)
namedtuple)Callable)Any))sparse_semi_structured_from_dense_cutlass'sparse_semi_structured_to_dense_cutlass)fallback_dispatchersemi_sparse_addmmsemi_sparse_clonesemi_sparse_detachsemi_sparse_indicessemi_sparse_linearsemi_sparse_mmsemi_sparse_scaled_mmsemi_sparse_tsemi_sparse_tosemi_sparse_to_copysemi_sparse_valuessemi_sparse_view)SparseSemiStructuredTensor!SparseSemiStructuredTensorCUTLASS$SparseSemiStructuredTensorCUSPARSELTto_sparse_semi_structured_SEMI_STRUCTURED_SPARSE_CONFIGz=sparse_min_rows sparse_min_cols dense_min_rows dense_min_colsc                      \ rS rSr% SrSr\\S'   \\	R                  \4   \S'   Sr\\S'   Sr\\S'   Sr\\S	'   \\S
'   \\\4   \S'   \	R$                  S-  \S'   \	R$                  S-  \S'   \	R$                  S-  \S'   \	R$                  S-  \S'   \	R$                  S-  \S'   \\S'   \\S'   / SQr\   S)S\	R*                  S\	R$                  S-  S\	R$                  S-  S\	R$                  S-  S\	R$                  S-  S\	R$                  S-  S\S\S\4S jj5       rS\4S jrS\\\   \\	R*                  \\\4   4   4S jr\S\\	R*                  \\\4   S\	R$                  4S j5       r\	R:                  R<                  r\S\ 4S j5       r!\S*S+S jj5       r"\S\	R$                  SS4S  j5       r#S! r$\\4S\	R$                  S"\SS 4S# jj5       r%SS$.S%\	R$                  S&\	R$                  S-  S\	R$                  4S' jjr&S(r'g),r   *   a  
This class implements semi-structured sparsity as a Tensor subclass.

Semi-structured sparsity describes a sparsity pattern where n in every 2n elements are sparse,
depending on the datatype. It is also referred to as 2:4 sparsity or fine-grained
structured sparsity.

There are two backends available for semi_structred sparsity, either cuSPARSELt or CUTLASS.
This class is meant to serve as a base class for both implementations. SparseSemiStructuredCUTLASS
and SparseSemiStructuredCUSPARSELT both inherit from this class and define three backend-specific items.
Note that as such, this class cannot be instantiated directly.

-`_DTYPE_SHAPE_CONSTRAINTS` - A dictionary holding backend specific dense/sparse min shape constraints
- `def from_dense()` - backend specific compression routines
- `def _mm()` - backend specific mm op (either torch._cslt_sparse_mm or torch._sparse_semi_structured_(mm|addmm))
r   _DEFAULT_ALG_ID_DTYPE_SHAPE_CONSTRAINTSF_FORCE_CUTLASS_FUSE_TRANSPOSE_PROTOTYPE_WARNING_SHOWNBACKENDSPARSE_DISPATCHNpackedmetapacked_tmeta_tcompressed_swizzled_bitmaskfuse_transpose_cusparseltalg_id_cusparselt)r#   r$   r%   r&   r'   shaperequires_gradc
           	         U R                   (       dP  [        R                  " S[        SS9  SU l         U R	                  5         [
        R                  R                  U 5        Ub  Un
OUb  Un
O[        S5      e[
        R                  R                  U UU
R                  U
R                  U
R                  U	S9nX+l        X;l        XKl        X[l        Xkl        X{l        Xl        U$ )a  
Create a new instance of the tensor subclass from the compressed sparse representation.

We have the option to create the subclass with the compressed representations of both X and X', for training.
For inference, we only need a single representation (either X or X'), while the corresponding other set will be None.

Depending on the backend selected, certain fields will be set to None. (CUSPARSELT vs CUTLASS)

Args:
    shape: The shape of the original dense tensor
    packed: The compressed representation of the original dense tensor
    meta: The metadata of the original dense tensor, if it is stored separately
    packed_t: The compressed representation of the transposed original dense tensor
    meta_t: The metadata of the transposed original dense tensor, if it is stored separately
    compressed_swizzled_bitmask: The masks used by the CUTLASS backend to determine which threads should
                                 participate in the computation. Used for pointwise ops.
    fuse_transpose_cusparselt: When running with cuSPARSELt, we have the option to fuse a transposition
                               with a matmul, which is useful in the case of 2:4 sparse training.
    alg_id_cusparselt: The algorithm id to use when using cuSPARSELT, will have effect on performance

Returns:
    torch.Tensor: A torch.Tensor wrapper subclass.

Raises:
    ValueError: If all of the tensor arguments are None.
zThe PyTorch API of SparseSemiStructuredTensor is in prototype stage and will change in the near future. Please open a Github issue for features requests and see our documentation on the torch.sparse module for further information about the project.   
stacklevelTz3At least one of packed or packed_t must be provided)devicedtypelayoutr+   )r    warningswarnUserWarning_load_dispatch_tabletorch_dynamoallow_in_graph
ValueErrorTensor_make_wrapper_subclassr0   r1   r2   r#   r$   r%   r&   r'   r(   r)   )clsr*   r#   r$   r%   r&   r'   r(   r)   r+   previous_tensortensors               V/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/sparse/semi_structured.py__new__"SparseSemiStructuredTensor.__new__O   s    N ++MMH
 	 ,0C(
 $$& MM((-$O!&ORSS44"))!''"))' 5 
 "-H*+D(#4     returnc                     [        U S5      (       d  [        S5      eU R                  R                   SU R                   S3$ )Nr*   ztensor has no shape attributez(shape=))hasattrAssertionError	__class____name__r*   )selfs    r@   __repr__#SparseSemiStructuredTensor.__repr__   s=    tW%% !@AA..))*'$**Q??rC   c                    ^  [        [        U 4S jT R                  5      5      nT R                  T R                  T R
                  T R                  4nX4$ )Nc                     > [        TU 5      S L$ N)getattr)xrK   s    r@   <lambda>?SparseSemiStructuredTensor.__tensor_flatten__.<locals>.<lambda>   s    WT1-T9rC   )listfilter	__slots__r*   r(   r)   r+   )rK   inner_tensorstensor_metas   `  r@   __tensor_flatten__-SparseSemiStructuredTensor.__tensor_flatten__   sT     94>>J
 JJ**""	
 ))rC   rY   c                     Uu  pVpxU " UUR                  SS 5      UR                  SS 5      UR                  SS 5      UR                  SS 5      UR                  SS 5      UUUS9	$ )Nr#   r$   r%   r&   r'   	r*   r#   r$   r%   r&   r'   r(   r)   r+   )get)	r=   rX   rY   
outer_sizeouter_strider*   r(   r)   r+   s	            r@   __tensor_unflatten__/SparseSemiStructuredTensor.__tensor_unflatten__   s     NYJ*; $$Xt4""640"&&z48 $$Xt4(5(9(9-t) '@/'
 	
rC   c                     UR                   U R                  ;  a%  [        U R                   SUR                   S35      eU R                  UR                      " XX45      $ )NzI only supports a specific set of operations, can't perform requested op (rF   )_overloadpacketr"   NotImplementedErrorrJ   )r=   functypesargskwargss        r@   __torch_dispatch__-SparseSemiStructuredTensor.__torch_dispatch__   sb    s':'::%<<. !//3}}oQ@  ""4#7#78dSSrC   c                     [        U SS5      Gc  [        R                  R                  R                  [
        [        R                  R                  R                  [        [        R                  R                  R                  [        [        R                  R                  R                  [        [        R                  R                  R                  [        [        R                  R                  R                  [        [        R                  R                  R                  [         [        R                  R                  R"                  [$        [        R                  R                  R&                  [$        [        R                  R                  R(                  [*        [        R                  R                  R,                  [.        [        R                  R                  R0                  [2        [        R                  R                  R4                  [6        [        R                  R                  R8                  [:        [        R                  R                  R<                  [>        0U l         Ub  U R@                  RC                  U5        ggg)zD
Loads the op overload sparse dispatch table for the current class.
r"   N)"rQ   r7   opsatenvaluesr   indicesr   is_same_sizer   detach_detachr   tr   viewr   mmr   matmuladdmmr	   linearr   _to_copyr   
_scaled_mmr   cloner
   tor   r"   update)r=   custom_dispatch_tables     r@   r6   /SparseSemiStructuredTensor._load_dispatch_table   sm   
 3)408		%%'9		&&(;		++-@		&&(;		%%'9		  -		##%5		!!>		%%~		$$&7		%%'9		'')<		))+@		$$&7		!!>#C" %0##**+@A 1% 9rC   original_tensorc           	         UR                   (       d  [        SUR                   S35      eUR                  5       S:w  a  [        SUR                  5        S35      eUR	                  5       (       d  [        S5      eUR
                  U R                  ;  a  [        SUR
                   SU  S	35      eUR                  u  p#U R                  UR
                     R                  nU R                  UR
                     R                  nX$:  d  X$-  (       d  X5:  d	  X5-  (       a  [        S
UR                   SU SU S35      eg)zO
Assert that the given tensor is valid for semi-structured sparse compression.
zError original_tensor.device= z= is not supported! Only CUDA tensors are currently supported.r-   zError original_tensor.dim = z; is not supported! Only 2d tensors are currently supported.zXError original_tensor is not contiguous!Only contiguous tensors are currently supported.zError original_tensor.dtype z is not a supported dtype for !zError original_tensor.shape zS is not supported! Both dimensions must be larger or equal than and a multiple of (z, rF   N)
is_cudaRuntimeErrorr0   dimis_contiguousr1   r   r*   sparse_min_rowssparse_min_cols)r=   r   mnmin_rowsmin_colss         r@    _validate_device_dim_dtype_shape;SparseSemiStructuredTensor._validate_device_dim_dtype_shape   sk    &&01G1G0H I= =   A%./B/B/D.E F; ;  ,,..C    (D(DD./D/D.EEcdgchhij 
 $$//0E0EFVV//0E0EFVV<1<1<1<./D/D.E FSS[R\\^_g^hhik  <HrC   c           	          U R                   S   n[        R                  " U [        R                  " XR                  U R
                  S95      $ )Nr1   r0   )r*   r7   rv   eyer1   r0   )rK   cols     r@   to_dense#SparseSemiStructuredTensor.to_dense  s3    jjnxxeii::dkkRSSrC   alg_idc                     [         erP   re   r=   r   r   s      r@   
from_dense%SparseSemiStructuredTensor.from_dense#  s
     "!rC   )biasBr   c                    [         erP   r   )rK   r   r   ri   s       r@   _mmSparseSemiStructuredTensor._mm+  s
     "!rC    )Fr   FrP   )rD   N)(rJ   
__module____qualname____firstlineno____doc__r   int__annotations__dictr7   r1   r   r   boolr   r    strr   r;   rW   staticmethodSizerA   rL   tuplerU   rZ   classmethodra   _C_disabled_torch_function_impl__torch_function__r   rj   r6   r   r   r   r   __static_attributes__r   rC   r@   r   r   *   s   " OS"5;;0N#NOO ND !OT!%*d*L(H,--LL4
,,
llT!!LL4!&!44##WI +0!"#RzzR t#R llT!	R
 ,,%R t#R &+\\D%8R $(R R R Rh@# @
*	tCy%

D#t ;<<	=* 
 5::tS$67
 

 
. ??Tc T T B B2 (u|| (PT ( (TT  &"" " 
&	" " %)	"<<" llT!	" 
" "rC   r   Fr   
transposedr   rD   c                     U(       a  [         R                  " S[        SS9  [        R                  (       a  [
        R                  R                  O[
        R                  R                  nUR                  XS9$ )a	  
This function converts a dense tensor into a sparse semi-structured tensor.
It will return a SparseSemiStructuredTensor, a subclass of torch.Tensor.

This function will check to ensure the dense tensor has the right dtype, size, dims, and device.
We currently only support semi-structured sparse tensors for 2d CUDA tensors.
Additionally, your tensor must be a positive multiple of the minimum sparse block size, given in
`_DTYPE_TO_SHAPE_CONSTRAINTS` for each dtype (float32, float16, bfloat16, int8).

Args:
    original_tensor (Tensor): the dense tensor to convert
    transposed (bool, optional): deprecated arg to be removed in another release. Do not use.
    alg_id (int, optional): the algorithm id to use for cuSPARSELt matmul. Defaults to 0.
        Can be obtained via ``torch._cslt_sparse_mm_search``.
Returns:
    SparseSemiStructuredTensor: A sparse semi-structured tensor created from the given original_tensor
Raises:
    None
Example:
    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> A = torch.Tensor([0, 0, 1, 1]).tile((128, 32)).half().cuda()
    tensor([[0., 0., 1.,  ..., 0., 1., 1.],
            [0., 0., 1.,  ..., 0., 1., 1.],
            [0., 0., 1.,  ..., 0., 1., 1.],
            ...,
            [0., 0., 1.,  ..., 0., 1., 1.],
            [0., 0., 1.,  ..., 0., 1., 1.],
            [0., 0., 1.,  ..., 0., 1., 1.]], device='cuda:0', dtype=torch.float16)
    >>> A_sparse = to_sparse_semi_structured(A)
    SparseSemiStructuredTensor(shape=torch.Size([128, 128]))
    >>> A_sparse.values()
    tensor([[1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            ...,
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.],
            [1., 1., 1.,  ..., 1., 1., 1.]], device='cuda:0', dtype=torch.float16),
    >>> A_sparse.indices()
    tensor([[-4370, -4370, -4370,  ..., -4370, -4370, -4370],
            [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
            [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
            ...,
            [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
            [-4370, -4370, -4370,  ..., -4370, -4370, -4370],
            [-4370, -4370, -4370,  ..., -4370, -4370, -4370]], device='cuda:0', dtype=torch.int16))
zSetting transpose from `to_sparse_semi_structured` is deprecated and will be removed in a future release. `SparseSemiStructuredTensor` only support contiguous input tensors.r-   r.   )r   )
r3   r4   FutureWarningr   r   r7   sparser   r   r   )r   r   r   SPARSE_SUBCLASSs       r@   r   r   5  sd    h R 	
 &44 	66\\>>  %%o%EErC   c                     ^  \ rS rSrSrSr\R                  \" SSSS5      \R                  \" SSSS5      \R                  \" SSSS5      \R                  \" SSS	S	5      0r\\R                  4S
\R                   S\SS 4S jj5       rU 4S jr\ SS
\R                   SS4S jj5       rSSS.S\R                   S\R                   S-  S\S\R                   4S jjrSrU =r$ )r   i|  a  
This class implements semi-structured sparsity for the CUTLASS backend.


In this implementation, the specified elements and metadata are stored separately,
in packed and meta respectively.

When _FORCE_CUTLASS is set, or when cuSPARSELt is not available, this subclass calls into _sparse_semi_structured_(mm|addmm) and
sparse_semi_structured_from_dense for conversion to the compressed format.
cutlass          @         r   r   rD   c           
          U R                  U5        [        U5      u  nnU " UR                  UUS S S UR                  S9$ )Nr#   r$   r%   r&   r'   r+   )r   r   r*   r+   )r=   r   r   sparse_tensor_cutlassmeta_tensor_cutlasss        r@   r   ,SparseSemiStructuredTensorCUTLASS.from_dense  sW     	,,_= 6oF	
! !!($(,)77
 	
rC   c                    > U R                   b  U R                  c  [        S5      eU R                   R                  S:X  a   [	        U R                  U R                   5      $ [
        TU ]  5       $ )Nz meta and packed must not be Noner-   )r$   r#   rH   ndimr   superr   )rK   rI   s    r@   r   *SparseSemiStructuredTensorCUTLASS.to_dense  sc    99 3 !CDD yy~~"	 4			
 !#	
rC   r   c           
      d    [         R                  " XSS9u  nnnnnU " UR                  UUUUUSS9$ )aF  
This function takes in a unpruned dense tensor and runs a (branchless) static sort across a 4x4 tile.

It greedily picks the largest values in the tile, upholding the 2:4 sparsity constraint across both rows and columns.
The algorithm used to prune the matrix is implemented in `_sparse_semi_structured_tile`.

Then it creates the packed and meta tensors for the compressed sparse representation of the pruned dense tensor.
It also calculates the packed_t and meta_t tensors for the compressed sparse representation of the transposed
pruned dense tensor.
Since we cannot transpose the compressed representations, we store both for the fw/bw pass respectively.

Finally, this function also computes a compressed swizzled bitmask that encodes the sparsity pattern
This can be used in the backward pass to mask the gradients.

[9 1 7 4]                       [9 0 7 0]
[1 2 3 0]                       [0 2 0 0]
[8 3 5 4] -> prune 4x4 tile  -> [8 0 0 4] -> pack to CUTLASS semi-structured -> packed
[1 2 6 2]                       [0 0 6 2]                                    -> metadata

                                          -> pack to transposed CUTLASS      -> packed_t
                                             semi-structured representation  -> metadata_t

                                          -> compute swizzled bitmask        -> compressed_swizzled_bitmask


The equivalent PyTorch code to create the same five outputs from the dense tensor can be found below:
```
from torch.sparse import SparseSemiStructuredTensorCUTLASS
from torch.sparse._semi_structured_conversions import (
    _sparse_semi_structured_tile,
    _compute_compressed_swizzled_bitmask,
)

pruned = _sparse_semi_structured_tile(dense)
packed_cutlass, meta_cutlass = sparse_semi_structured_from_dense_cutlass(pruned)
packed_t_cutlass, meta_t_cutlass = sparse_semi_structured_from_dense_cutlass(
    pruned.t().contiguous()
)
bitmask = _compute_compressed_swizzled_bitmask(pruned)

SparseSemiStructuredTensorCUTLASS(
    dense.shape,
    packed_cutlass,
    meta_cutlass,
    packed_t_cutlass,
    meta_t_cutlass,
    bitmask,
)
```
T	algorithmuse_cutlassFr   )r7   _sparse_semi_structured_tiler*   r=   r   r   r#   r$   r%   r&   r'   s           r@   prune_dense_static_sort9SparseSemiStructuredTensorCUTLASS.prune_dense_static_sort  sV    z ..d
	
' !!(C
 	
rC   NFr   should_transpose_denser   r   r   c          
      .   [        U[        5      (       a  [        S5      eU R                  R                  nU R
                  S:w  d  UR
                  S:w  a  [        SU S35      eU R                  b  U R                  c  [        SU S35      e[        5         U R                  UR                     n[        R                  R                  R                  UU R                  U R                  UU R                   S   UR"                  UR$                  U5      $ )NZ`SparseSemiStructuredTensor @ SparseSemiStructuredTensor` is not supported by the hardwarer-   `)` matmul: Broadcasting is not implemented$` matmul: operation is not supportedr   )
isinstancer   r:   rI   rJ   r   re   r#   r$   _ensure_cutlass_mm_registeredr   r1   r7   rm   semi_structured
cutlass_mmr*   dense_min_rowsdense_min_cols)rK   r   r   r   ri   cls_nameconstraintss          r@   r   %SparseSemiStructuredTensorCUTLASS._mm  s     a344l  >>**99>QVVq[%H:FG  ;;$))"3%H:AB  *+77@K99,,77		

1****&	 	rC   r    )rJ   r   r   r   r   r!   r7   int8r   float16bfloat16float32r   r   r   r   r;   r   r   r   r   r   r   r   __classcell__)rI   s   @r@   r   r   |  s(   	 G

22sBC5b"aC6r2q!D5b"aC	   1@@

 
 
-	
 
*

 68I
#llI
	%I
 I
^ %)',!<<! llT!	!
 !%! 
! !rC   r   c                      \ rS rSrSrSr\R                  \" SSSS5      \R                  \" SSSS5      \R                  \" SSSS5      \R                  \" SSSS5      0r\\R                  4S\R                   S\S	S 4S
 jj5       r\ SS\R                   S	S4S jj5       rSSS.S\R                   S\R                   S-  S\S	\R                   4S jjrSrg)r   i"  ab  
The cuSPARSELt backend expects the specified elements and the metadata to be stored in a single tensor:
packed = [ specified elements of original tensor | metadata ]
For an original tensor of size (m, k) we expect the first m * k // 2 elements to be the kept elements
The rest of the tensor is metadata. Since there is only one tensor, we only use the packed and packed_t
attributes respectively.

cuSPARSELt also supports transposition fusion, which is necessary for performant 2:4 sparse training, as well
as specifying alg_id, a config that affects the performance of the matmul depending on matmul sizes.

cusparseltr   r   r   r   r   rD   c                     U R                  U5        U " UR                  [        R                  " U5      S S S S [        R
                  UUR                  S9	$ )Nr]   )r   r*   r7   _cslt_compressr   r   r+   r   s      r@   r   /SparseSemiStructuredTensorCUSPARSELT.from_dense6  sW     	,,_=!''''8(,&@&P&P$)77

 
	
rC   r   c           
          [         R                  " XSS9u  nnnnnUR                  UR                  S   S5      nUR                  UR                  S   S5      nU " UR                  UUUUUSS9$ )am  
This function does the same thing as described in SparseSemiStructuredCUTLASS, but uses the cuSPARSELt metadata
layout and sparse matmul.

The only functional difference is that cuSPARSELt stores `metadata` and `packed` together into a single tensor.

[9 1 7 4]                       [9 0 7 0]
[1 2 3 0]                       [0 2 0 0]
[8 3 5 4] -> prune 4x4 tile  -> [8 0 0 4] -> pack to cuSPARSELT semi-structured -> packed
[1 2 6 2]                       [0 0 6 2]

                                          -> pack to transposed cuSPARSELt      -> packed_t
                                             semi-structured representation

                                          -> compute swizzled bitmask           -> compressed_swizzled_bitmask


The equivalent PyTorch code to create the same three outputs from the dense tensor can be found below:
```
from torch.sparse import SparseSemiStructuredTensorCUSPARSELT
from torch.sparse._semi_structured_conversions import (
    _sparse_semi_structured_tile,
    _compute_compressed_swizzled_bitmask,
)

pruned = _sparse_semi_structured_tile(dense)
packed_cusparselt = torch._cslt_compress(pruned)
packed_t_cusparselt = torch._cslt_compress(pruned.t().contiguous())
bitmask = _compute_compressed_swizzled_bitmask(pruned)

SparseSemiStructuredTensorCUSPARSELT(
    dense.shape, packed_cutlass, None, packed_t_cutlass, None, bitmask
)
```
Fr   r   r      r   )r7   r   ru   r*   r   s           r@   r   <SparseSemiStructuredTensorCUSPARSELT.prune_dense_static_sortJ  s    Z ..e
	
' _2215r:==!6!6q!92> !!(C
 	
rC   NFr   r   r   r   c                T   [        U[        5      (       a  [        S5      eU R                  S:w  d  UR                  S:w  a#  [	        SU R
                  R                   S35      eUR                  U R                  :w  ai  [	        SU R
                  R                   S[        U R                  5       S[        UR                  5       SU R                   SUR                   S	35      eUb  UR                  U R                  :w  ai  [	        SU R
                  R                   S[        U R                  5       S[        UR                  5       S
U R                   SUR                   S35      eU R                  [        R                  :X  a\  [	        SU R
                  R                   S[        U R                  5       S[        UR                  5       SU R                   S3	5      eU R                  c#  [	        SU R
                  R                   S35      e[        5         U R                  UR                     n[        R                  R                   R#                  UU R                  UU R                  S   UR$                  UR&                  U R(                  U R*                  U5	      $ )Nr   r-   r   r   z` matmul: trying to do `A=z @ B=z`, with A.dtype=z and B.dtype=zH. This operation is only supported when A and B have the same data type.z + C`, with A.dtype=B.dtype=z and C.dtype=zK. This operation is only supported when A, B and C have the same data type.z`, with A.dtype=B.dtype=zO. mm is not supported for float8_e4m3fn, please use `torch._scaled_mm` instead.r   r   )r   r   r:   r   re   rI   rJ   r1   r   r*   r7   float8_e4m3fnr#    _ensure_cusparselt_mm_registeredr   rm   r   cusparselt_mmr   r   r(   r)   )rK   r   r   r   ri   r   s         r@   r   (SparseSemiStructuredTensorCUSPARSELT._mm  sz    a344l  99>QVVq[%DNN++,,UV  77djj %DNN++,,FuTZZGXFYY^_defelel_m^n o  $

|=	 BYY 
 

djj 8%DNN++,,FuTZZGXFYY^_defelel_m^n o((,

|=	 J\\  ::,,,%DNN++,,FuTZZGXFYY^_defelel_m^n o((,

| 4`` 
 ;;%DNN++,,PQ  -.77@K99,,::

1****..&&&
 
rC   r   r   )rJ   r   r   r   r   r!   r7   r   r   r   r   r   r   r   r   r   r;   r   r   r   r   r   r   r   rC   r@   r   r   "  s    	 G;BBK

22r2rB5b"aC6r2q!D	   1@@

 
 
0	
 
& 68>
#ll>
	%>
 >
H %)',4<<4 llT!	4
 !%4 
4 4rC   r   c                     [         (       a  gSq SSKJn   U " SSS9S[        R                  S	[        R                  S
[        R                  S[        R                  S-  S[
        S[
        S[
        S[        S[        R                  4S j5       nUR                  S[        R                  S	[        R                  S
[        R                  S[        R                  S-  S[
        S[
        S[
        S[        S[        R                  4S j5       ng)zLazily register the cutlass_mm custom op.

Registration is deferred to avoid importing torch.library at module load
time, since torch.sparse is imported early during ``import torch``.
NTr   	custom_opzsemi_structured::cutlass_mmr   mutates_argsdenser#   r$   r   out_featuresr   r   r   rD   c                    U R                   u  pU* U-  n
U	* U-  nU
S:g  =(       d    US:g  nU nU(       a.  [        R                  R                  R	                  U SUSU
45      nU(       a  UR                  5       OUnUc  [        R                  " XU5      nO[        R                  " X1X.5      nU(       a<  U(       a  UOU	nUS U R                  SSU5      R                  [        R                  S9$ UR                  5       $ )Nr   r   memory_format)r*   r7   nn
functionalpadrt   _sparse_semi_structured_mm_sparse_semi_structured_addmmnarrowr|   contiguous_format
contiguous)r   r#   r$   r   r   r   r   r   r   r   to_pad_mto_pad_nneed_paddense_paddedmm_inputresout_colss                    r@   r   1_ensure_cutlass_mm_registered.<locals>.cutlass_mm  s     {{B(?B(?q=1HM 88..2251h8:TUL'=<>>#<<226JC55dDSC2qHM\"1h'U%<%<=
 ~~rC   transpose_densec                     U(       a  U R                   S   OU R                   S   n[        R                  " UUU R                  U R                  S9$ Nr   r   r   r*   r7   emptyr1   r0   )	r   r#   r$   r   r   r   r   r  r  s	            r@   _cutlass_mm_fake7_ensure_cutlass_mm_registered.<locals>._cutlass_mm_fake  sB     &55;;q>%++a.{{++<<	
 	
rC   )_cutlass_mm_registeredtorch.libraryr   r7   r;   r   r   register_fake)r   r   r  s      r@   r   r     s(    !',2> ||   ll  llT!	 
       !%  
  ? > 
||

 ll
 llT!	

 
 
 
 
 

 
rC   c                     [         (       a  gSq SSKJn   U " SSS9 SS[        R                  S	[        R                  S
[        R                  S-  S[
        S[
        S[
        S[        S[
        S[        S[        R                  4S jj5       nUR                  S[        R                  S	[        R                  S
[        R                  S-  S[
        S[
        S[
        S[        S[
        S[        S[        R                  4S j5       ng)z,Lazily register the cusparselt_mm custom op.NTr   r   zsemi_structured::cusparselt_mmr   r   r   r#   r   r   r   r   fuse_transposer   r   rD   c	                    U R                   u  pU	* U-  nU
* U-  nUS:g  =(       d    US:g  nU nU(       a.  [        R                  R                  R	                  U SUSU45      nU(       a  UR                  5       OUn[        R                  " UUUUUS9nU(       a  UR                  5       nU(       a9  U(       a  U	OU
nUR                  SSU5      R                  [        R                  S9$ UR                  5       $ )Nr   )r   transpose_resultr   r   r   )r*   r7   r   r  r  rt   _cslt_sparse_mmr  r|   r  r  )r   r#   r   r   r   r   r  r   r   r   r   r  r	  r
  r  r  r  r  s                     r@   r   7_ensure_cusparselt_mm_registered.<locals>.cusparselt_mm  s     {{B(?B(?q=1HM 88..2251h8:TUL'=<>>#<##+
 %%'C2qH::aH-33#55 4   ~~rC   c	                     U(       a  U R                   S   OU R                   S   n	[        R                  " UU	U R                  U R                  S9$ r  r  )
r   r#   r   r   r   r   r  r   r   r  s
             r@   _cusparselt_mm_fake=_ensure_cusparselt_mm_registered.<locals>._cusparselt_mm_fake6  sB     &<5;;q>Q{{++<<	
 	
rC   )F)_cusparselt_mm_registeredr  r   r7   r;   r   r   r  )r   r   r!  s      r@   r   r   	  s;    !  $'/bA (-! ||! !  llT!!  	! 
 !  !  !  !  !%!  
!  B! F   
||

 llT!
 	

 
 
 
 
 !%
 

 !
rC   )'r3   collectionsr   collections.abcr   typingr   r7   )torch.sparse._semi_structured_conversionsr   r   !torch.sparse._semi_structured_opsr   r	   r
   r   r   r   r   r   r   r   r   r   r   __all__r   r;   r   r   r   r   r   r   r   r  r   r#  r   r   rC   r@   <module>r*     s     " $     " ",$C" H" H"Z ,<<DF\\DFDF DF  	DFNc(B cL]+E ]@  >
B " ?
rC   