
    +jo                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlZd dlmZ d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZmZ d	d
lmZmZm Z m!Z!  ej"        dd          Z# ej$        e#          %                    d          Z&dZ'dZ(dZ)dZ* G d d          Z+d Z,	 	 	 	 d#dZ-d Z. G d d          Z/d dgddfde0e1e2e1         f         de1de2e3         d e4d!e4f
d"Z5dS )$    N)stderr)OptionalDictUnion)tqdm)infer_auto_device_mapdispatch_model)get_balanced_memory)snapshot_download)	load_file	save_file   )	get_codecget_32bit_codecget_lutsencode_weightsdfloat11z
decode.ptx)pathdecodez0.5.0   )i   )encoded_exponentsign_mantissac                   ,    e Zd ZdZi Zed             ZdS )TensorManagerz
    Static utility class that manages tensor allocation and reuse
    to minimize memory allocation overhead during tensor reconstruction.
    c                    t          | t                    rt          j        |           } | t          j        v r_t          j        |          }|                                |k    r
|d|         S t          j        | = t          j                                         t          j	        |t          j
        |           }t          d| d|  t                     |t          j        | <   |S )ao  
        Get a bfloat16 tensor with at least n_elements on the specified device.

        If a tensor already exists on the device and is larger than n_elements,
        a slice of the tensor with exactly n_elements is returned. If n_elements 
        exceeds the size of the existing tensor, the existing tensor is deallocated 
        and a larger one is allocated.

        Args:
            device: The device to allocate the tensor on (e.g., 'cuda:0')
            n_elements: The exact number of elements required

        Returns:
            A bfloat16 tensor with exactly n_elements on the specified device
        Ndtypedevicez
Allocated z bf16 on device file)
isinstancestrtorchr   r   _tensorsnumelcudaempty_cacheemptybfloat16printr   )r   
n_elementsexisting_tensor
new_tensors       T/home/wildlama/comfy/ComfyUI/.venv/lib/python3.11/site-packages/dfloat11/dfloat11.pyallocate_bfloat16zTensorManager.allocate_bfloat16<   s    $ fc"" 	*\&))F ]++++4V<O $$&&*44&{
{33 &v.J""$$$ [5>&QQQ
?:??v??fMMMM *4v&    N)__name__
__module____qualname____doc__r$   staticmethodr/    r0   r.   r   r   4   s>         
 H' ' \' ' 'r0   r   c                 4     t                       fd}|S )a  
    Creates a PyTorch forward pre-hook that decodes compressed DFloat11 weights on-the-fly.
    
    This hook reconstructs full-precision weights from compressed representations
    using a custom CUDA kernel during the forward pass.
    
    Args:
        threads_per_block: CUDA thread configuration 
        bytes_per_thread: Number of bytes processed per CUDA thread
        
    Returns:
        A forward pre-hook function for PyTorch modules
    c                 v   | j         j        }t          | d          rs| j                                        D ]Y\  }}t          | |          rt          | |          j        |k    s+|                     ||                    |d                     Z| j        	                                }| j
        	                                }| j         j        d         }t                              ||          }t          t          j        |d         z  z                      f}	t"          j                            |j                  5  t+          |	| j        | j                                         | j
                                        | j                                        | j                                        | j                                        |                                |||g	           d d d            n# 1 swxY w Y   t5          | t6          j                  r&|                    | j        | j                  | _         nt5          | t6          j!                  r&|                    | j"        | j#                  | _         nZtI          j%        || j&                  }
tO          | j(        |
          D ]*\  }}|                    |j        |j                  |_         +t          | d          rM| j        )                                D ]5}t          | |          r!t          | |          }tU          | |           ~4d S d S )Noffloaded_tensorsT)non_blockingr   gridblock
shared_memargs)+lutsr   hasattrr9   itemsgetattrregister_buffertor   r%   r   shaper   r/   intmathceilcpr&   Deviceindex_decodeshared_mem_sizedata_ptroutput_positionsgapsr!   nnLinearviewout_featuresin_featuresweight	Embeddingnum_embeddingsembedding_dimr#   tensor_splitsplit_positionszipweight_injection_moduleskeysdelattr)module_r   tensor_nametensorr+   n_bytesn_lutsreconstructedblocks_per_gridweights
sub_modulerW   tmpbytes_per_threadthreads_per_blocks                 r.   decode_hookzget_hook.<locals>.decode_hookw   sf   # 6.// 	^'-'?'E'E'G'G ^ ^#VFK00^6=fk6R6R6Y]c6c6c**;		&W[	8\8\]]] )//11
)//11"1% &77
KK ty4Ea4HK[4[)\]]^^a W^^FL)) 		 		0AfNd$$&&'0022$--//'0022$$&&&&((l    		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 fbi(( 	a)..#V%7 FMM -- 	a)..%v'; FMM
 (8NOOG&)&*I7&S&S a a"
F$*KK
0GI_$`$`
!! 6.// 	%7<<>>  6;// !&+66CFK000	 	 s   B'G55G9<G9)tuple)rm   rl   rn   s   `` r.   get_hookrp   g   s<     /007 7 7 7 7 7r r0   FTc                 T
   |d         }|d         }|d         }	|sd t          j        |          D             n|g}
d}|r|dz  }|r|dz  }|dz  }t          |
|	          D ]}|s t           j                            ||          n|}t          |          }|                                D ]x\  }}||                                 v r|t          | 	                                          v r|t          | 	                                          |         }|j
        |j
        k    r|j                            |           t          d
| d|j
         d|j
         t                     t          |                                           |         }|j
        |j
        k    r|                    |           
t          d
| d|j
         d|j
         t                     5|                    d          }| }t#          |dd                   D ]A\  }}t%          ||          rt'          ||          }&t          d| t                      n|d         dk    r$t)          |d|                                           n|r||dk    r|d         t,          v rt%          |d          st)          |di            |r|                                n||j        |d         <   |5|dk    r/t3          |j                  t3          t,                    k    r|dz  }n|                    |d         |           |d         dk    rK|                    t9          ||                     |	                                D ]\  }}t;          j        |d                    |dd                             rt?          |t@          j!                  r|j"        }tG          |d           ~it?          |t@          j$                  r|j"        }tG          |d           ~t)          |dg            |D ]`}|                    d          }|}|D ]}t'          ||          }|j"        }tG          |d           ~|j%        &                    |           a|d         dk    r|'                    tP          j)                  *                                }t)          |d|d         dz  dz   |dd         |dd         z
  +                                ,                                dz  z              z| S )a  
    Loads DFloat11 compressed weights from safetensors files and configures the model
    to use them with on-the-fly decompression.
    
    Args:
        model: The PyTorch model to load weights into
        directory_path: Path to the directory containing safetensors files
        dfloat11_config: Configuration for DFloat11 compression
        
    Returns:
        The model with configured DFloat11 compression
    rm   rl   pattern_dictc                 <    g | ]}|                     d           |S ).safetensors)endswith).0fs     r.   
<listcomp>z,load_and_replace_tensors.<locals>.<listcomp>   s9       N1K1K	  r0   zLoading DFloat11 safetensorsz (offloaded to CPUz, memory pinned))desczShape mismatch for z: model z vs loaded r   .NzCannot find module path for r\   r   r9   r   r   rW   r^   rP   rN         )-oslistdirr   r   joinr   rB   
state_dictdictnamed_parametersrF   datacopy_r*   r   named_bufferssplit	enumeraterA   rC   setattrtolistoffloaded_tensor_names
pin_memoryr9   lenrD   register_forward_pre_hookrp   re	fullmatchr!   rR   rX   rW   r`   rS   r^   appendrT   r#   uint32numpymaxitem)modeldirectory_pathdfloat11_configcpu_offloadcpu_offload_blocksr   from_single_filerm   rl   rr   safetensors_filesloading_desc	file_name	file_pathloaded_tensorsrc   tensor_valueparambufferpartsra   ipartpattern
attr_namesrk   	attr_pathtargetpoutput_positions_nps                                 r.   load_and_replace_tensorsr      sJ   * ((;<'(:;'7L
 4  :n--   $2#3  2L ,, 	.--L+,??? [ [	CSbBGLL;;;Yb	 #9-- *8)=)=)?)? T	 T	%Ke..0000$u'='='?'?"@"@@@ !7!7!9!9::;GE{l&888
((6666uKuuuuamasuu  }C  D  D  D  D  D "%"5"5"7"788EF||'999\2222vKvvvvbnbtvv  ~D  E  E  E  E  E $))#..  )ss44 < <GAtvt,, !(!6!6J[JJQWXXXXRy$555(9<;N;N;P;PQQQQ& 
L,>,FJ\_`J`J`fklnfo  tJ  gJ  gJ#*63F#G#G I '0CR H H H_iB{,BYBYB[B[B[o{F4U2Y? 2 >EWZ[E[E[beflf~bb  DG  H^  D_  D_  c_  c_ 2a 7 2 #2259lKKK Ry$66688BSUe9f9fggg 4@3E3E3G3G W W/GZ!|GSXXeCRCj5I5IJJ W#-fbl#C#C !W*0-C$+FH$=$=$=(+%/	%B%B !W*0-C$+FH$=$=$=(+ %,F4NPR$S$S$S5? 	%W 	%W	090D0D1716 )H )HA5<VQ5G5GFF.4m(/(A(A(A,/(.(G(N(Nv(V(V(V(V1W2 r&888.:.?.?.M.M.S.S.U.U+"--a014q8<OPQPRPR<SVijmkmjmVn<n;s;s;u;u;z;z;|;|  @A  <A  A  aT	l Lr0   c                     g }|D ]V}|                                  D ]?\  }}t          j        ||          r%|j        j        }||vr|                    |           @W|S )a  
    Find model layer classes that should not be split across devices.
    
    This is crucial for DFloat11 model sharding to ensure compressed modules
    stay on the same device as their decompression buffers.
    
    Args:
        model: The PyTorch model
        pattern_dict: Dictionary mapping regex patterns to submodule lists
        
    Returns:
        List of class names that should not be split across devices
    )named_modulesr   r   	__class__r1   r   )r   rr   no_split_classesr   	full_namerj   
class_names          r.   get_no_split_classesr   7  s      8 8%*%8%8%:%: 	8 	8!Iz|GY// 8'1:
%555$++J777		8 r0   c                      e Zd ZdZe	 	 	 	 	 	 	 	 	 ddedee         ded	eeee	ef         ee	ef         f                  d
e
dee	         de
de
deeeee         f                  fd            Ze	 	 	 	 	 	 ddedeeee         f         dee         ded	eeee	ef         ee	ef         f                  d
e
dee	         de
fd            ZdS )DFloat11Modelz
    Wrapper class for loading and using models with DFloat11 compressed weights.
    DFloat11 is a custom 11-bit floating point format that provides memory efficiency
    while maintaining numerical accuracy for LLM weights.
    NautoFTdfloat11_model_name_or_pathr   
device_map
max_memoryr   r   r   r   rr   c           	         |	rgt           j                            |          r|}nt           j                            |          rt	          d| d          t          d| d          t           j                            |          r|}nF|                    dd          }t           j                            |          st          ||           |rz|	rdt          t          t          |
d	i}n\t          t           j                            |d
          dd          5 }t          j        |          }ddd           n# 1 swxY w Y   |}nddlm}m}m} ddlm} |                    |          } |            5   |j        |fdt0          j        i|}|                                 |                                 ddd           n# 1 swxY w Y   	 |                    |          }||_        n# t:          $ r
}Y d}~nd}~ww xY wt=          |t>                    rd|v r	|d         }n'tA          |d          r|j!        }ntE          d          tG          |||||||	           |sSd}|$                                %                                D ]}||j&        z  }tO          d|dz  ddtP                     |r|)                    |          }n|dk    s
J d            tU          ||d                   }tW          |||          }tY          |||          }t[          ||          }t]          d |/                                D                       rtO          dtP                     |S ) aU  
        Load a model with DFloat11 compressed weights from local path or Hugging Face Hub.
        
        Args:
            dfloat11_model_name_or_path: Local path or HF Hub model name
            device: Target device for the model
            device_map: Strategy for distributing model across devices
            max_memory: Maximum memory allocation per device
            bfloat16_model: Optional pre-initialized model to load weights into
            cpu_offload: Enables CPU offloading; only keeps a single block of weights in GPU at once
            cpu_offload_blocks: Number of transformer blocks to offload to CPU; if None, offload all blocks
            pin_memory: Enables memory-pinning/page-locking when using CPU offloading
            from_single_file: Whether to load a single safetensors file
            pattern_dict: Dictionary mapping regex patterns to submodule lists
            **kwargs: Additional arguments passed to AutoModelForCausalLM.from_config
            
        Returns:
            Model with DFloat11 compressed weights configured for on-the-fly decompression
        zeExpected `dfloat11_model_name_or_path` to be the path to a safetensors file, but found a directory: "".z
The file "z" does not exist./__)	local_dirr   versionrm   rl   rr   config.jsonrutf-8encodingNr   )AutoModelForCausalLM
AutoConfigGenerationConfig)no_init_weightstorch_dtypezd"dfloat11_config" not found: it is expected to be found in the config file or passed as an argument.)r   r   r   r   zTotal model size: g    eAz0.4fz GBr   r   z>device_map should be 'auto' if no specific device is provided.rr   )r   no_split_module_classesc              3   6   K   | ]}|j         j        d k    V  dS )cpuN)r   type)rv   r   s     r.   	<genexpr>z0DFloat11Model.from_pretrained.<locals>.<genexpr>  s,      NN%5<$-NNNNNNr0   zqWarning: Some model layers are on CPU. For inference, ensure the model is fully loaded onto CUDA-compatible GPUs.)0r   r   isfileisdirIsADirectoryErrorFileNotFoundErrorexistsreplacer   r   rm   rl   openr   jsonloadtransformersr   r   r   transformers.modeling_utilsr   from_pretrainedfrom_configr#   r)   tie_weightsevalgeneration_config	Exceptionr!   r   rA   r   AttributeErrorr   r   valuesnbytesr*   r   rE   r   r
   r   r	   any
parameters)clsr   r   r   r   bfloat16_modelr   r   r   r   rr   kwargsdfloat11_model_pathconfigrw   r   r   r   r   r   r   er   model_bytesr   r   s                             r.   r   zDFloat11Model.from_pretrainedV  s   F  	bw~~9:: e&A##:;; e'  )p  Ql  )p  )p  )p  q  q  q'(c5P(c(c(cdddw~~9:: b&A##&A&I&I#t&T&T#w~~&9:: b%&AM`aaaa  !	 *%#*->,<(4	( ( "',,':MJJCZabbb *fg!Yq\\F* * * * * * * * * * * * * * * #EEWWWWWWWWWWCCCCCC  //0CDDF ""  8,8 (-:@  !!###

              $4$D$DEX$Y$Y!*;''    fd## 	I(9V(C(C$%67OOV.// 	I$4OO   "H  I  I  I 	!&#8J!4D	
 	
 	
 	
  	QK))++2244 , ,u|+B{S'8BBBBPPPP  	XHHV$$EE''')i'''3E?>;Z[[,UzcstttJ.ueuvvvJ"5*55E NN5;K;K;M;MNNNNN X  J  QW  X  X  X  Xs7   E  EE=AGGGG5 5
H	H	r   c
                 @    |                      ||||||||	d|
  
        S )NT)
r   r   r   r   r   r   r   r   r   rr   )r   )
r   r   rr   r   r   r   r   r   r   r   s
             r.   r   zDFloat11Model.from_single_file  s?     ""(;!!)#1!!% # 
 
 	
r0   )	Nr   NNFNTFN)Nr   NFNT)r1   r2   r3   r4   classmethodr"   r   r   r   rG   boolr   listr   r   r6   r0   r.   r   r   P  s        
  !% GK!,0!&7;{ {%({ { 	{
 T%S/5c?"BCD{ { %SM{ { { tCcN34{ { { [{|  !% GK!,0
 
 
 3S	>*

 
 
 T%S/5c?"BCD
 
 %SM
 
 
 
 [
 
 
r0   r   i'  rr   	save_pathblock_rangesave_single_filecheck_correctnessc                   12 t          j        |d           d}d}|                                D ]F\  }}	|                                 D ]*\  2}
t	          j        |2          r|sUd2v r@2                    dd          \  }}|                     |          }t          ||d            nt          | 2d            |dz  }||d         k    r||d         k    rd} ng }t          |
t          j                  r|
j        j        j        t          j        k    sJ d2 d|
j        j        j                     |                    |
j        j                                                                                                                   t+          |
d	           nt          |
t          j                  r|
j        j        j        t          j        k    sJ d2 d|
j        j        j                     |                    |
j        j                                                                                                                   t+          |
d	           n|	D ]}|                    d          }|
}|D ]}t1          ||          }|j        j        j        t          j        k    s"J d2 d| d|j        j        j                     |                    |j        j                                                                                                                   t+          |d	           t3          t          j        |                    \  }}t7          |          \  }}}|                                 t;          |          }t=          ||t>          t@          d                   \  }}}}}|rt          j!        d
          1|j"        d         }|#                                }|#                                } tI          tK          1fd|||||g                    \  }!}"}#}$}%t          j&        |t          j        1          }&tO          tQ          j)        | t@          d         t>          z  z                      f}'|*                    t          j+                  ,                                }(t@          d         dz  dz   |(dd          |(d d         z
  -                                .                                dz  z   })t_          d|) d           t`          j1        2                    1j3                  5  ti          |'t@          |)|!5                                |"5                                |#5                                |$5                                |%5                                |&5                                || |g	           d d d            n# 1 swxY w Y   t          j        |          |&                                k    6                                .                                }*|*rt_          d           nto          d2 d          |
8                    d|           |
8                    d|           |
8                    d|           |
8                    d|*                    t          j9                             |
8                    d|           |
8                    d|           |sy|
:                                }+2fd|+                                D             }+tw          |+t           j<        =                    |2>                    dd          dz                        ,H|rt~          t@          t>          |d},t          | d           r	 |,| jA        _B        n# t          $ r Y nw xY wt          | d!          r|s| D                    |           n@tw          | :                                t           j<        =                    |d"                     d}-t           j<        E                    t           j<        =                    |d#                    r}t          t           j<        =                    |d#          d$d%&          5 }.t          jH        |.          }/d'|/v rt          |/d'         t                    rd}-d d d            n# 1 swxY w Y   |-rdt          t           j<        =                    |d#          d(          5 }0t          jJ        d'|,i|0d)           d d d            d S # 1 swxY w Y   d S d S d S )*NT)exist_okr   r{   r   Fz@Expected weights to be in bfloat16 format for compression, but 'z' has dtype rW   zcuda:0c                 .    |                                S )N)rE   )xr   s    r.   <lambda>z compress_model.<locals>.<lambda>7  s#    uvuyuy  {A  vB  vB r0   r   r}   r|   r~   zUsing z bytes of shared memory.r;   uN   ✅ Correctness check passed: decompressed weights match the original weights.ue   ❌ Correctness check failed: The decompressed weights do not match the original weights for module "r   r@   r   r   rP   rQ   r\   c                 &    i | ]\  }} d | |S )r{   r6   )rv   keyvaluer   s      r.   
<dictcomp>z"compress_model.<locals>.<dictcomp>Z  s-    !c!c!c*#uY"6"6"6"6!c!c!cr0   rb   rt   r   r   save_pretrainedzmodel.safetensorsr   r   r   r   r   w)indent)Kr   makedirsrB   r   r   r   rsplitget_submoduler   r!   rR   rX   rW   r   r   r#   r)   r   detachr   flattenr`   rS   r   rC   r   catr   print_code_tabler   r   rl   rm   r   rF   r%   r   mapr(   rG   nprI   rT   r   r   r   r   r*   rJ   r&   rK   rL   rM   rO   allRuntimeErrorrD   uint8r   r   r   r   r   r   rA   r   r   r   r   r   r   r   r   r   dump)3r   rr   r   r   r   r   block_index
save_modelr   r   rj   parent_name
child_nameparentri   r   r   r   r   _codec_countercodecrb   tabler@   encodedother_8bitsrP   rQ   r\   rf   r+   re   	cuda_lutscuda_encodedcuda_other_8bitscuda_output_positions	cuda_gapscuda_outputsrh   r   rN   _is_correctr   r   save_configrw   r   config_filer   r   s3                                                    @@r.   compress_modelr$    s
    K	D))))KJ+1133 _q _q%*%8%8%:%: ]	q ]	q!Iz|GY// \q' 8i''2;2B2B32J2J/Z!&!4!4[!A!A
D9999y$777q +a.00 ;q>11!&JEj",77 2%,175>III Q[d  Q  Qr|  sD  sI  sO  Q  Q JIINN:#4#9#@#@#B#B#F#F#H#H#P#P#R#RSSSJ1111
BI66 2%,175>III Q[d  Q  Qr|  sD  sI  sO  Q  Q JIINN:#4#9#@#@#B#B#F#F#H#H#P#P#R#RSSSJ1111 &0 	2 	2	 ) 4 4!+!& 8 8A%,VQ%7%7FF%}175>III ]_h  ]  ]kt  ]  ]  CI  CP  CU  C[  ]  ]  JIIv}'9'@'@'B'B'F'F'H'H'P'P'R'RSSS1111#,UYw-?-?#@#@ "1(";";q%&&(((P^_fhmo  BS  TU  BV  QW  QWM&6o$ b"\(33F!Z]F!,!2!2!4!4J%mmooGbfgj  lB  lB  lB  lB  EI  KR  T_  aq  sw  Dx  hy  hy  cz  cz_I|-=?TV_#(;zX^#_#_#_L'*277>OPQ>RUe>e3f+g+g'h'h&kO*:*?*?*M*M*S*S*U*U'&7&:Q&>&BFYZ[Z\Z\F]`stwuwtw`xFxE}E}EE  FE  FE  FG  FG  JK  FK  'KOL?LLLMMM55 	 	_<MZi%..00(1133,55771::<<%..00(1133"GZq    	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 $)9W#5#59I9I9K9K#K"P"P"R"R"W"W"Y"YK" bnoooo*  ,a  T]  ,a  ,a  ,a  b  b  b**64888**+=wGGG**?KHHH**+=?O?T?TUZU`?a?abbb**64888**+<oNNN' q!+!6!6!8!8J!c!c!c!cPZP`P`PbPb!c!c!cJj"',,y)BSBSTWY\B]B]`nBn*o*oppp *!2 0(	
 
 5(## 	/>,,    5+,, 	X5E 	X!!),,,,e&&(("',,yBU*V*VWWW7>>"',,y-@@AA 	(bgll9m<<cGTTT (XY1$..:fEV>WY]3^3^."'K	( ( ( ( ( ( ( ( ( ( ( ( ( ( (  	*bgll9m<<cBB *k	%q* * * ** * * * * * * * * * * * * * * * * *7* *4	* 	*sI   BWW"W#^0 0
^=<^=6cc cd;;d?d?)FNTF)6rH   r   r   r   pkg_resourcessysr   typingr   r   r   r   r#   torch.nnrR   cupyrJ   r   r  
accelerater   r	   accelerate.utilsr
   huggingface_hubr   safetensors.torchr   r   dfloat11_utilsr   r   r   r   resource_filenameptx_path	RawModuleget_functionrM   r   rl   rm   r   r   rp   r   r   r   r   r"   r   rG   r   r$  r6   r0   r.   <module>r3     s    				 				            ( ( ( ( ( ( ( ( ( (                      < < < < < < < < 0 0 0 0 0 0 - - - - - - 2 2 2 2 2 2 2 2 P P P P P P P P P P P P +=*:|DD
",H
%
%
%
2
28
<
<    
0 0 0 0 0 0 0 0fI I I` A A A AH  2]
 ]
 ]
 ]
 ]
 ]
 ]
 ]
H  Z!"L* L*sDI~&L* L* c	L*
 L* L* L* L* L* L* L*r0   