
    
9jo                        S SK r S SKrS SKrS SKrS SKrS SKJr  S SKJrJ	r	J
r
  S SKJr  S SKrS SKJr  S SKrS SKrS SKJrJr  S SKJr  S SKJr  S SKJrJr  S	S
KJrJrJ r J!r!  \RD                  " SS5      r#\RH                  " \#S9RK                  S5      r&Sr'Sr(Sr)Sr* " S S5      r+S r,    S"S jr-S r. " S S5      r/S S/SS4S\0\1\2\1   4   S\1S\2\3   S\4S \44
S! jjr5g)#    N)stderr)OptionalDictUnion)tqdm)infer_auto_device_mapdispatch_model)get_balanced_memory)snapshot_download)	load_file	save_file   )	get_codecget_32bit_codecget_lutsencode_weightsdfloat11z
decode.ptx)pathdecodez0.5.0   )i   )encoded_exponentsign_mantissac                   ,    \ rS rSrSr0 r\S 5       rSrg)TensorManager4   z
Static utility class that manages tensor allocation and reuse
to minimize memory allocation overhead during tensor reconstruction.
c                    [        U [        5      (       a  [        R                  " U 5      n U [        R
                  ;   a[  [        R
                  U    nUR                  5       U:  a  USU $ [        R
                  U 	 [        R                  R                  5         [        R                  " U[        R                  U S9n[        SU SU  3[        S9  U[        R
                  U '   U$ )a  
Get a bfloat16 tensor with at least n_elements on the specified device.

If a tensor already exists on the device and is larger than n_elements,
a slice of the tensor with exactly n_elements is returned. If n_elements 
exceeds the size of the existing tensor, the existing tensor is deallocated 
and a larger one is allocated.

Args:
    device: The device to allocate the tensor on (e.g., 'cuda:0')
    n_elements: The exact number of elements required

Returns:
    A bfloat16 tensor with exactly n_elements on the specified device
Ndtypedevicez
Allocated z bf16 on device file)
isinstancestrtorchr   r   _tensorsnumelcudaempty_cacheemptybfloat16printr   )r   
n_elementsexisting_tensor
new_tensors       K/home/wildlama/miniconda3/lib/python3.13/site-packages/dfloat11/dfloat11.pyallocate_bfloat16TensorManager.allocate_bfloat16<   s    $ fc""\\&)F ]++++44V<O $$&*4&{
33 &&v.JJ""$ [[5>>&Q

:,&6vh?fM *4v&     N)	__name__
__module____qualname____firstlineno____doc__r%   staticmethodr0   __static_attributes__r3   r2   r/   r   r   4   s     
 H' 'r2   r   c                 .   ^ ^ [        T 5      m UU 4S jnU$ )a  
Creates a PyTorch forward pre-hook that decodes compressed DFloat11 weights on-the-fly.

This hook reconstructs full-precision weights from compressed representations
using a custom CUDA kernel during the forward pass.

Args:
    threads_per_block: CUDA thread configuration 
    bytes_per_thread: Number of bytes processed per CUDA thread
    
Returns:
    A forward pre-hook function for PyTorch modules
c                   > U R                   R                  n[        U S5      (       am  U R                  R	                  5        HO  u  p4[        X5      (       a  [        X5      R                  U:X  a  M0  U R                  X4R                  USS95        MQ     U R                  R                  5       nU R                  R                  5       nU R                   R                  S   n[        R                  X%5      n[        [        R                   " UTS   T-  -  5      5      4n	["        R$                  R'                  UR(                  5         [+        U	TU R,                  U R                   R/                  5       U R                  R/                  5       U R                  R/                  5       U R0                  R/                  5       U R2                  R/                  5       UR/                  5       XvU/	S9  S S S 5        [5        U [6        R8                  5      (       a,  UR;                  U R<                  U R>                  5      U l         O[5        U [6        RB                  5      (       a,  UR;                  U RD                  U RF                  5      U l         Oj[H        RJ                  " XRL                  5      n
[O        U RP                  U
5       H0  u  pUR;                  UR<                  UR>                  5      Ul         M2     [        U S5      (       aK  U R                  RS                  5        H,  n[        X5      (       d  M  [        X5      n[U        X5        AM.     g g ! , (       d  f       GNl= f)Noffloaded_tensorsT)non_blockingr   gridblock
shared_memargs)+lutsr   hasattrr=   itemsgetattrregister_buffertor   r&   r   shaper   r0   intmathceilcpr'   Deviceindex_decodeshared_mem_sizedata_ptroutput_positionsgapsr"   nnLinearviewout_featuresin_featuresweight	Embeddingnum_embeddingsembedding_dimr$   tensor_splitsplit_positionszipweight_injection_moduleskeysdelattr)module_r   tensor_nametensorr,   n_bytesn_lutsreconstructedblocks_per_gridweights
sub_moduler[   tmpbytes_per_threadthreads_per_blocks                 r/   decode_hookget_hook.<locals>.decode_hookw   s   ## 6.//'-'?'?'E'E'G#F00gf6R6Y6Y]c6c**;		&W[	8\]	 (H ))//1
))//1""1% &77K tyy4Ea4HK[4[)\]^a WW^^FLL)0AfNdNd$$&''002$$--/''002$$&&&(l  * fbii(()..##V%7%7FM --)..%%v';';FM
 ((8N8NOG&)&*I*I7&S"
$*KK
0G0GI_I_$`
! 'T 6.//%77<<>6//!&6CF0	  ? 07 *)s   
B%M
M#)tuple)rq   rp   rr   s   `` r/   get_hookru   g   s     /07r r2   Tc                 
   US   nUS   nUS   n	U(       d>  [         R                  " U5       V
s/ s H  oR                  S5      (       d  M  U
PM     sn
OU/nSnU(       a  US-  nU(       a  US-  nUS-  n[        XS	9 GH  nU(       d  [         R                  R                  X5      OUn[        U5      nUR                  5        GH4  u  nnUU R                  5       ;   Ga  U[        U R                  5       5      ;   a  [        U R                  5       5      U   nUR                  UR                  :X  a  UR                  R                  U5        M  [        S
U SUR                   SUR                   3[        S9  M  [        U R!                  5       5      U   nUR                  UR                  :X  a  UR                  U5        GM  [        S
U SUR                   SUR                   3[        S9  GM1  UR#                  S5      nU n[%        USS 5       H8  u  nn['        UU5      (       a  [)        UU5      nM%  [        SU 3[        S9    GM     US   S:X  a  [+        USUR-                  5       5        OU(       a  Ub  US:  a  US   [.        ;   a|  ['        US5      (       d  [+        US0 5        U(       a  UR1                  5       OUUR2                  US   '   Ub1  US:  a+  [5        UR2                  5      [5        [.        5      :X  a  US-  nOUR7                  US   U5        US   S:X  GaO  UR9                  [;        Xx5      5        U	R                  5        GH  u  nn[<        R>                  " USR                  USS 5      5      (       d  M7  [A        U[B        RD                  5      (       a  URF                  n[I        US5        AMq  [A        U[B        RJ                  5      (       a  URF                  n[I        US5        AM  [+        US/ 5        U H_  nUR#                  S5      nUnU H  n[)        UU5      nM     URF                  n[I        US5        AURL                  RO                  U5        Ma     GM      GM  US   S:X  d  GM  URQ                  [R        RT                  5      RW                  5       n[+        USUS   S-  S-   USS USS -
  RY                  5       R[                  5       S-  -   5        GM7     GM     U $ s  sn
f )a{  
Loads DFloat11 compressed weights from safetensors files and configures the model
to use them with on-the-fly decompression.

Args:
    model: The PyTorch model to load weights into
    directory_path: Path to the directory containing safetensors files
    dfloat11_config: Configuration for DFloat11 compression
    
Returns:
    The model with configured DFloat11 compression
rq   rp   pattern_dict.safetensorszLoading DFloat11 safetensorsz (offloaded to CPUz, memory pinned))desczShape mismatch for z: model z vs loaded r    .NzCannot find module path for r`   r   r=   r   r   r[   rb   rT   rR         ).oslistdirendswithr   r   joinr   rF   
state_dictdictnamed_parametersrJ   datacopy_r+   r   named_bufferssplit	enumeraterE   rG   setattrtolistoffloaded_tensor_names
pin_memoryr=   lenrH   register_forward_pre_hookru   re	fullmatchr"   rV   r\   r[   rd   rW   rb   appendrX   r$   uint32numpymaxitem)modeldirectory_pathdfloat11_configcpu_offloadcpu_offload_blocksr   from_single_filerq   rp   rw   fsafetensors_filesloading_desc	file_name	file_pathloaded_tensorsrg   tensor_valueparambufferpartsre   ipartpattern
attr_namesro   	attr_pathtargetpoutput_positions_nps                                  r/   load_and_replace_tensorsr      s   * ((;<'(:;'7L
  ::n--aN1K-$2#3  2L,,--L+?	CSBGGLL;Yb	 #9- *8)=)=)?%Ke..00$u'='='?"@@ !7!7!9:;GE{{l&8&88

((6 3K=U`amasas`tu  }C  D "%"5"5"78EF|||'9'99\2 3K=Vabnbtbtauv  ~D  E $))#.  )s4GAtvt,,!(!6 <[MJQWX  5 Ry$55(9<;N;N;PQ&,>,FJ\_`J`fklnfo  tJ  gJ#*63F#G#G '0CR H_i,BYBYB[o{F44U2Y? 2 >EWZ[E[beflf~f~b  DG  H^  D_  c_ 2a 7 2 #2259lK Ry$6688BS9fg 4@3E3E3G/GZ!||GSXXeCRj5IJJ#-fbll#C#C*0--C$+FH$=(+%/		%B%B*0--C$+FH$=(+ %,F4NPR$S5?	090D1716A5<VQ5GF 27 /5mm(/(A,/(.(G(G(N(Nv(V 6@ 4H2 r&88.:.?.?.M.S.S.U+"--a014q8<OPQPR<SVijmkmVn<n;s;s;u;z;z;|  @A  <A  Aa *@ @z LOs   TTc                     / nU Hg  nU R                  5        HP  u  pE[        R                  " X45      (       d  M"  UR                  R                  nXb;  d  M?  UR                  U5        MR     Mi     U$ )a~  
Find model layer classes that should not be split across devices.

This is crucial for DFloat11 model sharding to ensure compressed modules
stay on the same device as their decompression buffers.

Args:
    model: The PyTorch model
    pattern_dict: Dictionary mapping regex patterns to submodule lists
    
Returns:
    List of class names that should not be split across devices
)named_modulesr   r   	__class__r4   r   )r   rw   no_split_classesr   	full_namern   
class_names          r/   get_no_split_classesr   7  sf     %*%8%8%:!I||G//'11::
5$++J7	 &;   r2   c                   0   \ rS rSrSr\         SS\S\\   S\S\\\	\
\4   \	\
\4   4      S\S	\\
   S
\S\S\\\\\   4      4S jj5       r\      SS\S\\\\   4   S\\   S\S\\\	\
\4   \	\
\4   4      S\S	\\
   S
\4S jj5       rSrg)DFloat11ModeliP  z
Wrapper class for loading and using models with DFloat11 compressed weights.
DFloat11 is a custom 11-bit floating point format that provides memory efficiency
while maintaining numerical accuracy for LLM weights.
Ndfloat11_model_name_or_pathr   
device_map
max_memoryr   r   r   r   rw   c           
         U	(       ai  [         R                  R                  U5      (       a  UnO[         R                  R                  U5      (       a  [	        SU S35      e[        SU S35      e[         R                  R                  U5      (       a  UnO?UR                  SS5      n[         R                  R                  U5      (       d	  [        XS9  U(       ah  U	(       a  S[        [        [        U
S	.0nOH[        [         R                  R                  US
5      SSS9 n[        R                  " U5      nSSS5        UnOSSKJnJnJn  SSKJn  UR-                  U5      nU" 5          UR.                  " U4S[0        R2                  0UD6nUR5                  5         UR7                  5         SSS5         UR-                  U5      nUWl        [=        W[>        5      (       a  SU;   a  US   nO)[A        US5      (       a  URB                  nO[E        S5      e[G        WUUXgXS9  U(       dL  SnURI                  5       RK                  5        H  nUURL                  -  nM     [O        SUS-  S S3[P        S9  U(       a  URS                  U5      nU$ US:X  d   S5       e[U        UUS   5      n[W        XUS9n[Y        XUS9n[[        X5      n[]        S UR_                  5        5       5      (       a  [O        S[P        S9  U$ ! , (       d  f       GN= f! , (       d  f       GNn= f! [:         a  n SnAGNhSnAff = f) a  
Load a model with DFloat11 compressed weights from local path or Hugging Face Hub.

Args:
    dfloat11_model_name_or_path: Local path or HF Hub model name
    device: Target device for the model
    device_map: Strategy for distributing model across devices
    max_memory: Maximum memory allocation per device
    bfloat16_model: Optional pre-initialized model to load weights into
    cpu_offload: Enables CPU offloading; only keeps a single block of weights in GPU at once
    cpu_offload_blocks: Number of transformer blocks to offload to CPU; if None, offload all blocks
    pin_memory: Enables memory-pinning/page-locking when using CPU offloading
    from_single_file: Whether to load a single safetensors file
    pattern_dict: Dictionary mapping regex patterns to submodule lists
    **kwargs: Additional arguments passed to AutoModelForCausalLM.from_config
    
Returns:
    Model with DFloat11 compressed weights configured for on-the-fly decompression
zeExpected `dfloat11_model_name_or_path` to be the path to a safetensors file, but found a directory: "".z
The file "z" does not exist./__)	local_dirr   versionrq   rp   rw   config.jsonrutf-8encodingNr   )AutoModelForCausalLM
AutoConfigGenerationConfig)no_init_weightstorch_dtypezd"dfloat11_config" not found: it is expected to be found in the config file or passed as an argument.)r   r   r   r   zTotal model size: g    eAz0.4fz GBr    autoz>device_map should be 'auto' if no specific device is provided.rw   )r   no_split_module_classesc              3   R   #    U  H  oR                   R                  S :H  v   M     g7f)cpuN)r   type).0r   s     r/   	<genexpr>0DFloat11Model.from_pretrained.<locals>.<genexpr>  s     N;M%<<$$-;Ms   %'zqWarning: Some model layers are on CPU. For inference, ensure the model is fully loaded onto CUDA-compatible GPUs.)0r   r   isfileisdirIsADirectoryErrorFileNotFoundErrorexistsreplacer   r   rq   rp   openr   jsonloadtransformersr   r   r   transformers.modeling_utilsr   from_pretrainedfrom_configr$   r*   tie_weightsevalgeneration_config	Exceptionr"   r   rE   r   AttributeErrorr   r   valuesnbytesr+   r   rI   r   r
   r   r	   any
parameters)clsr   r   r   r   bfloat16_modelr   r   r   r   rw   kwargsdfloat11_model_pathconfigr   r   r   r   r   r   r   er   model_bytesr   r   s                             r/   r   DFloat11Model.from_pretrainedV  s)   F ww~~9::&A#:;;'  +P  Ql  Pm  mo  )p  q  q'*5P4QQb(cddww~~9::&A#&A&I&I#t&T#ww~~&9::%&Aa %#*->,<(4	( "'',,':MJCZabfg!YYq\F c #EWWC  //0CDF ",88(-:@ !!#

 #$4$D$DEX$Y!*;'
 fd##(9V(C$%67OV.//$44O   "H  I  I 	!&#!	
 K))+224u||+ 5 &{S'8&>cBP HHV$E  'i)ii'3E?>;Z[,UcstJ.ueuvJ"55E N5;K;K;MNNN  J  QW  X} cb #"  s+   $L.AL;L+ 
L
L(+
M ;M r   c
                 2    U R                  UUUUUUUU	SUS9
$ )NT)
r   r   r   r   r   r   r   r   r   rw   )r   )
r   r   rw   r   r   r   r   r   r   r   s
             r/   r   DFloat11Model.from_single_file  s:     ""(;!!)#1!!% # 
 	
r2   r3   )	Nr   NNFNTFN)Nr   NFNT)r4   r5   r6   r7   r8   classmethodr#   r   r   r   rK   boolr   listr   r   r:   r3   r2   r/   r   r   P  s   
  !% GK!,0!&7;{%({ { 	{
 T%S/5c?"BCD{ { %SM{ { { tCcN34{ {|  !% GK!,0
 
 3S	>*

 
 
 T%S/5c?"BCD
 
 %SM
 
 
r2   r   i'  rw   	save_pathblock_rangesave_single_filecheck_correctnessc                 *  ^4 [         R                  " USS9  SnSnUR                  5        GH  u  pU R                  5        GH  u  p[        R
                  " X5      (       d  M#  U(       dD  SU
;   a2  U
R                  SS5      u  pU R                  U5      n[        XS 5        O[        X
S 5        US-  nXcS   ::  a  M}  XcS   :  a  Sn  M  / n[        U[        R                  5      (       a  UR                  R                  R                  [        R                   :X  d+   SU
 SUR                  R                  R                   35       eUR#                  UR                  R                  R%                  5       R'                  5       R)                  5       5        [+        US	5        GO[        U[        R,                  5      (       a  UR                  R                  R                  [        R                   :X  d+   SU
 SUR                  R                  R                   35       eUR#                  UR                  R                  R%                  5       R'                  5       R)                  5       5        [+        US	5        OU	 H  nUR/                  S5      nUnU H  n[1        UU5      nM     UR                  R                  R                  [        R                   :X  d.   SU
 SU SUR                  R                  R                   35       eUR#                  UR                  R                  R%                  5       R'                  5       R)                  5       5        [+        US	5        M     [3        [        R4                  " U5      5      u  nn[7        U5      u  nnnUR9                  5         [;        U5      n[=        UU[>        [@        S   5      u  nnnnnU(       Ga8  [        RB                  " S
5      m4URD                  S   nURG                  5       n URG                  5       n![I        [K        U44S jUUUUU/5      5      u  n"n#n$n%n&[        RL                  " U [        R                   T4S9n'[O        [P        RR                  " U![@        S   [>        -  -  5      5      4n(URU                  [        RV                  5      RY                  5       n)[@        S   S-  S-   U)SS  U)S S -
  R[                  5       R]                  5       S-  -   n*[_        SU* S35        [`        Rb                  Re                  T4Rf                  5         [i        U([@        U*U"Rk                  5       U#Rk                  5       U$Rk                  5       U%Rk                  5       U&Rk                  5       U'Rk                  5       UU!U /	S9  S S S 5        [        R4                  " U5      U'R'                  5       :H  Rm                  5       R]                  5       n+U+(       a  [_        S5        O[o        SU
 S35      eURq                  SU5        URq                  SU5        URq                  SU5        URq                  SURU                  [        Rr                  5      5        URq                  SU5        URq                  SU5        U(       a  GMg  URu                  5       n,U,R                  5        V-V.s0 s H  u  n-n.U
 SU- 3U._M     n,n-n.[w        U,[         Rx                  R{                  X*R}                  SS5      S-   5      5        GM     GM     U(       Ga  [~        [@        [>        US.n/[        U S5      (       a   U/U R                  lB        [        U S 5      (       a  U(       d  U R                  U5        O8[w        U Ru                  5       [         Rx                  R{                  US!5      5        Sn0[         Rx                  R                  [         Rx                  R{                  US"5      5      (       ah  [        [         Rx                  R{                  US"5      S#S$S%9 n1[        R                  " U15      n2S&U2;   a  [        U2S&   [        5      (       a  Sn0S S S 5        U0(       aL  [        [         Rx                  R{                  US"5      S'5       n3[        R                  " S&U/0U3SS(9  S S S 5        g g g ! , (       d  f       GN)= fs  sn.n-f ! [         a     GNf = f! , (       d  f       N= f! , (       d  f       g = f))NT)exist_okr   r{   r   Fz@Expected weights to be in bfloat16 format for compression, but 'z' has dtype r[   zcuda:0c                 &   > U R                  T5      $ )N)rI   )xr   s    r/   <lambda> compress_model.<locals>.<lambda>7  s    uvuyuy  {A  vBr2   r   r}   r|   r~   zUsing z bytes of shared memory.r?   uN   ✅ Correctness check passed: decompressed weights match the original weights.ue   ❌ Correctness check failed: The decompressed weights do not match the original weights for module "r   rD   r   r   rT   rU   r`   rf   rx   r   r   save_pretrainedzmodel.safetensorsr   r   r   r   r   w)indent)Kr   makedirsrF   r   r   r   rsplitget_submoduler   r"   rV   r\   r[   r   r   r$   r*   r   detachr   flattenrd   rW   r   rG   r   catr   print_code_tabler   r   rp   rq   r   rJ   r&   r   mapr)   rK   nprM   rX   r   r   r   r   r+   rN   r'   rO   rP   rQ   rS   allRuntimeErrorrH   uint8r   r   r   r   r   r   rE   r   r   r   r  r   r   r   r   r   dump)5r   rw   r   r   r   r   block_index
save_modelr   r   r   rn   parent_name
child_nameparentrm   r   r   r   r   _codec_countercodecrf   tablerD   encodedother_8bitsrT   rU   r`   rj   r,   ri   	cuda_lutscuda_encodedcuda_other_8bitscuda_output_positions	cuda_gapscuda_outputsrl   r   rR   _is_correctr   keyvaluer   save_configr   r   config_filer   s5                                                       @r/   compress_modelr)    s_    KK	D)KJ+113%*%8%8%:!I||G//'i'2;2B2B32J/!&!4!4[!AD9$7q a.0 q>1!&Jj",,77%,,11775>>I QZ[dZeeqr|  sD  sD  sI  sI  sO  sO  rP  QQINN:#4#4#9#9#@#@#B#F#F#H#P#P#RSJ1
BII66%,,11775>>I QZ[dZeeqr|  sD  sD  sI  sI  sO  sO  rP  QQINN:#4#4#9#9#@#@#B#F#F#H#P#P#RSJ1 &0	 ) 4!+!&A%,VQ%7F "'  &}}11775>>I ]^_h^iijktju  vB  CI  CP  CP  CU  CU  C[  C[  B\  ]]Iv}}'9'9'@'@'B'F'F'H'P'P'RS1 &0 $-UYYw-?#@ "1(";q%&&(P^_fhmo  BS  TU  BV  QWM&6o$"\\(3F!ZZ]F!,!2!2!4J%mmoGbfgj  lB  EI  KR  T_  aq  sw  Dx  hy  cz_I|-=?TV_#(;;zX^#_L'*2777>OPQ>RUe>e3f+g'h&kO*:*?*?*M*S*S*U'&7&:Q&>&BFYZ[Z\F]`stwuw`xFxE}E}E  FE  FE  FG  JK  FK  'KOF?"33KLM5_<MZi%..0(113,5571::<%..0(113"GZq  6 $)99W#59I9I9K#K"P"P"R"W"W"YK"no*  .S  T]  S^  ^`  ,a  b  b**648**+=wG**?KH**+=?O?T?TUZU`U`?ab**648**+<oN''!+!6!6!8JPZP`P`Pb!cPb*#uYKq"6"=PbJ!cj"'',,yBSBSTWY\B]`nBn*op{ &;  4B !2 0(	
 5(##/>, 5+,,5E!!),e&&("'',,yBU*VW77>>"'',,y-@AAbggll9m<cGTXY1$.:fEV>WY]3^3^"'K	 U bggll9m<cBk		%q* CB 5 = 654 "d   UT CBs=   :A.c
c
c" 67c3&d

c"
c0/c03
d
d)FNTF)6rL   r   r   r   pkg_resourcessysr   typingr   r   r   r   r$   torch.nnrV   cupyrN   r   r  
accelerater   r	   accelerate.utilsr
   huggingface_hubr   safetensors.torchr   r   dfloat11_utilsr   r   r   r   resource_filenameptx_path	RawModuleget_functionrQ   r   rp   rq   r   r   ru   r   r   r   r   r#   r   rK   r   r)  r3   r2   r/   <module>r8     s    	 	    ( (      < 0 - 2 P P **:|D
,,H
%
2
28
<    
0 0fI` AH2]
 ]
H  Z!"L*sDI~&L* L* c	L*
 L* L*r2   