
    +jI                        d dl Zd dlmZ d dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZ d dlmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(  e) ed          *                    d                    Z+ ej,        e+          -                    d          Z.d Z/	 	 	 	 ddZ0 G d de          Z1dS )    N)files)stderr)infer_auto_device_mapdispatch_model)get_balanced_memory)	load_file	save_file)tqdm)OptionalDictUnion)DFloat11Model)TensorManagerget_no_split_classes)versionthreads_per_blockbytes_per_thread)	get_codecget_32bit_codecget_lutsencode_weights   )!convert_diffusers_to_comfyui_fluxdfloat11z
decode.ptx)pathdecodec                 4     t                       fd}|S )a  
    Creates a PyTorch forward pre-hook that decodes compressed DFloat11 weights on-the-fly.
    
    This hook reconstructs full-precision weights from compressed representations
    using a custom CUDA kernel during the forward pass.
    
    Args:
        threads_per_block: CUDA thread configuration 
        bytes_per_thread: Number of bytes processed per CUDA thread
        
    Returns:
        A forward pre-hook function for PyTorch modules
    c                    | j         j        }t          | d          rs| j                                        D ]Y\  }}t          | |          rt          | |          j        |k    s+|                     ||                    |d                     Z| j        	                                }| j
        	                                }| j         j        d         }t          j        ||          }t          t          j        |d         z  z                      f}	t"          j                            |j                  5  t+          |	| j        | j                                         | j
                                        | j                                        | j                                        | j                                        |                                |||g	           d d d            n# 1 swxY w Y   t5          | j                  dk    rt9          j        |dd         |dd	         |dd         |d	d          f          |d d <   t9          j        || j                  }
|
d                              | j        d         j!        | j        d         j"                  | j        d         _#        |d
d                              dd          | j        d         _#        |
d                              | j        d         j!        | j        d         j"                  | j        d         _#        |
d                              | j        d         j!        | j        d         j"                  | j        d         _#        |
d                              | j        d         j!        | j        d         j"                  | j        d         _#        |
d                              | j        d         j!        | j        d         j"                  | j        d         _#        |dd	                              dd          | j        d         _#        |
d                              | j        d         j!        | j        d         j"                  | j        d         _#        |
d                              | j        d         j!        | j        d         j"                  | j        d         _#        |
d                              | j        d         j!        | j        d         j"                  | j        d         _#        nDt5          | j                  dk    r t9          j        |d
d         |dd         |dd
         |dd         f          |d d <   t9          j        || j                  }
|dd                              dd          | j        d         _#        |
d                              | j        d         j!        | j        d         j"                  | j        d         _#        |d
d                              dd          | j        d         _#        n+tI          t5          | j                   d| j                   t          | d          rM| j        %                                D ]5}t          | |          r!t          | |          }tM          | |           ~4d S d S )Noffloaded_tensorsT)non_blockingr   )gridblock
shared_memargs
   i  pi  	i   
i  i $  i   r                        	            i  i  i T  z weight_injection_modules 
)'lutsdevicehasattrr   itemsgetattrregister_buffertosign_mantissanumelencoded_exponentshaper   allocate_bfloat16intmathceilcpcudaDeviceindex_decodeshared_mem_sizedata_ptroutput_positionsgapslenweight_injection_modulestorchcattensor_splitsplit_positionsviewout_featuresin_featuresweight	Exceptionkeysdelattr)module_r2   tensor_nametensor
n_elementsn_bytesn_lutsreconstructedblocks_per_gridweightstmpr   r   s               Y/home/wildlama/comfy/ComfyUI/custom_nodes/ComfyUI-DFloat11-Extended/dfloat11_diffusers.pydecode_hookz,get_hook_flux_diffusers.<locals>.decode_hook4   s   # 6.// 	^'-'?'E'E'G'G ^ ^#VFK00^6=fk6R6R6Y]c6c6c**;		&W[	8\8\]]] )//11
)//11"1% &7
KK ty4Ea4HK[4[)\]]^^a W^^FL)) 		 		0AfNd$$&&'0022$--//'0022$$&&&&((l    		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 v.//255  %y-)*DmT]^gTgFhjw  yB  CL  yL  kM  O\  ]f  ]g  ]g  Oh  *i   j   jM!!!(8NOOG8?
HghiHjHwy  zY  Z[  z\  zh  9i  9iF+A.58EiR[F[8\8a8abfhl8m8mF+A.58?
HghiHjHwy  zY  Z[  z\  zh  9i  9iF+A.58?8H8HIhijIkIx  {A  {Z  [\  {]  {i  9j  9jF+A.58?8H8HIhijIkIx  {A  {Z  [\  {]  {i  9j  9jF+A.58?
HghiHjHwy  zY  Z[  z\  zh  9i  9iF+A.58EiR[F[8\8a8abfhl8m8mF+A.58?
HghiHjHwy  zY  Z[  z\  zh  9i  9iF+A.58?8H8HIhijIkIx  {A  {Z  [\  {]  {i  9j  9jF+A.58?8H8HIhijIkIx  {A  {Z  [\  {]  {i  9j  9jF+A.55 011Q66  %y-	)8K*Lm\dem\mNnp}  G  HQ  Q  qR  Ta  bc  dl  bl  Tm  *n   o   oM!!!(8NOOG8Eaj8Q8V8VW\^b8c8cF+A.58?
HghiHjHwy  zY  Z[  z\  zh  9i  9iF+A.58EiPYFY8Z8_8_`dfj8k8kF+A.55 s6#BCC  C  Cag  bA  C  C  D  D  D 6.// 	%7<<>>  6;// !&+66CFK000	 	 s   <B'G//G36G3)tuple)r   r   rb   s   `` ra   get_hook_flux_diffusersrd   $   sB     /00M M M M M M^     FTc                 n
   |d         }|d         }|d         }	|sd t          j        |          D             n|g}
d}|r|dz  }|r|dz  }|dz  }t          |
|	          D ]}|s t           j                            ||          n|}t          t          |                    }|                                D ]x\  }}||                                 v r|t          | 
                                          v r|t          | 
                                          |         }|j        |j        k    r|j                            |           t          d
| d|j         d|j         t                     t          |                                           |         }|j        |j        k    r|                    |           
t          d
| d|j         d|j         t                     5|                    d          }| }t%          |dd                   D ]A\  }}t'          ||          rt)          ||          }&t          d| t                      n|d         dk    r$t+          |d|                                           n|r||dk    r|d         t.          v rt'          |d          st+          |di            |r|                                n||j        |d         <   |5|dk    r/t5          |j                  t5          t.                    k    r|dz  }n|                    |d         |           |d         dk    rK|                    t;          ||                     |	                                D ]\  }}t=          j        |d                    |dd                             rtA          |tB          j"                  r|j#        }tI          |d           ~itA          |tB          j%                  r|j#        }tI          |d           ~t+          |dg            |D ]`}|                    d          }|}|D ]}t)          ||          }|j#        }tI          |d           ~|j&        '                    |           a|d         dk    r|(                    tR          j*                  +                                }t+          |d|d         dz  dz   |dd         |dd         z
  ,                                -                                dz  z              z| S )a  
    Loads DFloat11 compressed weights from safetensors files and configures the model
    to use them with on-the-fly decompression.
    
    Args:
        model: The PyTorch model to load weights into
        directory_path: Path to the directory containing safetensors files
        dfloat11_config: Configuration for DFloat11 compression
        
    Returns:
        The model with configured DFloat11 compression
    r   r   pattern_dictc                 <    g | ]}|                     d           |S )z.safetensors)endswith).0fs     ra   
<listcomp>z;load_and_replace_tensors_flux_diffusers.<locals>.<listcomp>   s9       N1K1K	  re   zLoading DFloat11 safetensorsz (offloaded to CPUz, memory pinned))desczShape mismatch for z: model z vs loaded file.NzCannot find module path for rN   r   r   r   r:   rR   rJ   rG   rE   r*   r'   ).oslistdirr
   r   joinr   r   r4   
state_dictdictnamed_parametersr;   datacopy_printr   named_bufferssplit	enumerater3   r5   setattrtolistoffloaded_tensor_names
pin_memoryr   rI   r6   register_forward_pre_hookrd   re	fullmatch
isinstancenn	EmbeddingrR   rU   LinearrJ   appendrO   rK   uint32numpymaxitem)modeldirectory_pathdfloat11_configcpu_offloadcpu_offload_blocksr   from_single_filer   r   rg   safetensors_filesloading_desc	file_name	file_pathloaded_tensorsrX   tensor_valueparambufferpartsrV   ipartpattern
attr_namesr`   	attr_pathtargetpoutput_positions_nps                                 ra   'load_and_replace_tensors_flux_diffusersr      sS   * ((;<'(:;'7L
 4  :n--   $2#3  2L ,, 	.--L+,??? \ \	CSbBGLL;;;Yb	 ;9Y;O;OPP *8)=)=)?)? U	 U	%Ke..0000$u'='='?'?"@"@@@ !7!7!9!9::;GE{l&888
((6666uKuuuuamasuu  }C  D  D  D  D  D "%"5"5"7"788EF||'999\2222vKvvvvbnbtvv  ~D  E  E  E  E  E $))#..  )ss44 = =GAtvt,, !(!6!6J[JJQWXXXXRy$555(9<;N;N;P;PQQQQ& 
L,>,FJ\_`J`J`fklnfo  tJ  gJ  gJ#*63F#G#G I '0CR H H H_iB{,BYBYB[B[B[o{F4U2Y? 2 >EWZ[E[E[beflf~bb  DG  H^  D_  D_  c_  c_ 2a 7 2 #2259lKKK Ry$666889PQbdt9u9uvvv 4@3E3E3G3G W W/GZ!|GSXXeCRCj5I5IJJ W#-fbl#C#C !W*0-C$+FH$=$=$=(+%/	%B%B !W*0-C$+FH$=$=$=(+ %,F4NPR$S$S$S5? 
%W 
%W	090D0D1716 )H )HA5<VQ5G5GFF.4m(/(A(A(A,/(.(G(N(Nv(V(V(V(V3W4 r&888.:.?.?.M.M.S.S.U.U+"--a014q8<OPQPRPR<SVijmkmjmVn<n;s;s;u;u;z;z;|;|  @A  <A  A  cU	n Lre   c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddedee         ded	eeee	ef         ee	ef         f                  d
e
dee	         de
de
deeeee         f                  fd            Z xZS )DFloat11FluxDiffusersModelc                 H    t                                                       d S )N)super__init__)self	__class__s    ra   r   z#DFloat11FluxDiffusersModel.__init__  s    re   NautoFTdfloat11_model_name_or_pathr2   
device_map
max_memoryr   r   r   r   rg   c           	         |	rgt           j                            |          r|}nt           j                            |          rt	          d| d          t          d| d          t           j                            |          r|}nF|                    dd          }t           j                            |          st          ||           t          d           |r|	rd	t          t          t          |
d
i}nbt          t           j                            |d          dd          5 }t                              |          }ddd           n# 1 swxY w Y   |}nt#          d          t%          |t&                    rd	|v r	|d	         }n't)          |d	          r|j        }nt-          d          t/          |||||||	           |sSd}|                                                                D ]}||j        z  }t          d|dz  ddt6                     |r|                    |          }n|dk    s
J d            t;          ||d                   }t=          |||          }t?          |||          }tA          ||          }tC          d |"                                D                       rt          dt6                     |S )aU  
        Load a model with DFloat11 compressed weights from local path or Hugging Face Hub.
        
        Args:
            dfloat11_model_name_or_path: Local path or HF Hub model name
            device: Target device for the model
            device_map: Strategy for distributing model across devices
            max_memory: Maximum memory allocation per device
            bfloat16_model: Optional pre-initialized model to load weights into
            cpu_offload: Enables CPU offloading; only keeps a single block of weights in GPU at once
            cpu_offload_blocks: Number of transformer blocks to offload to CPU; if None, offload all blocks
            pin_memory: Enables memory-pinning/page-locking when using CPU offloading
            from_single_file: Whether to load a single safetensors file
            pattern_dict: Dictionary mapping regex patterns to submodule lists
            **kwargs: Additional arguments passed to AutoModelForCausalLM.from_config
            
        Returns:
            Model with DFloat11 compressed weights configured for on-the-fly decompression
        zeExpected `dfloat11_model_name_or_path` to be the path to a safetensors file, but found a directory: "z".z
The file "z" does not exist./__)	local_dirz0Using overriden DFloat11FluxDiffusersModel classr   )r   r   r   rg   zconfig.jsonrzutf-8)encodingNz"`bfloat16_model` must be specifiedzd"dfloat11_config" not found: it is expected to be found in the config file or passed as an argument.)r   r   r   r   r   zTotal model size: g    eAz0.4fz GBro   r   z>device_map should be 'auto' if no specific device is provided.rg   )r   no_split_module_classesc              3   6   K   | ]}|j         j        d k    V  dS )cpuN)r2   type)rj   r   s     ra   	<genexpr>z=DFloat11FluxDiffusersModel.from_pretrained.<locals>.<genexpr>z  s,      NN%5<$-NNNNNNre   zqWarning: Some model layers are on CPU. For inference, ensure the model is fully loaded onto CUDA-compatible GPUs.)#rs   r   isfileisdirIsADirectoryErrorFileNotFoundErrorexistsreplacesnapshot_downloadr{   r   r   r   openru   jsonloadrS   r   rw   r3   r   AttributeErrorr   rv   valuesnbytesr   r7   r   r   r   r   any
parameters)clsr   r2   r   r   bfloat16_modelr   r   r   r   rg   kwargsdfloat11_model_pathconfigrk   r   r   model_bytesr   no_split_classess                       ra   from_pretrainedz*DFloat11FluxDiffusersModel.from_pretrained  s   F  	bw~~9:: e&A##:;; e'  )p  Ql  )p  )p  )p  q  q  q'(c5P(c(c(cdddw~~9:: b&A##&A&I&I#t&T&T#w~~&9:: b%&AM`aaaa@AAA  	B *%#*->,<(4	( ( "',,':MJJCZabbb *fg!YYq\\F* * * * * * * * * * * * * * * #EE@AAA fd## 	I(9V(C(C$%67OOV.// 	I$4OO   "H  I  I  I 	0&#8J!4D	
 	
 	
 	
  	QK))++2244 , ,u|+B{S'8BBBBPPPP  	XHHV$$EE''')i'''3E?>;Z[[,UzcstttJ.ueuvvvJ"5*55E NN5;K;K;M;MNNNNN X  J  QW  X  X  X  Xs   .EEE)	Nr   NNFNTFN)__name__
__module____qualname__r   classmethodstrr   r   r   r=   boolrw   listr   __classcell__)r   s   @ra   r   r     s             !% GK!,0!&7;l l%(l l 	l
 T%S/5c?"BCDl l %SMl l l tCcN34l l l [l l l l lre   r   )FNTF)2cupyr@   importlib.resourcesr   r   r>   uuidrs   rK   torch.nnr   sysr   
accelerater   r   accelerate.utilsr   safetensors.torchr   r	   r
   typingr   r   r   r   r   dfloat11.dfloat11r   r   r   r   r   dfloat11.dfloat11_utilsr   r   r   r   convert_fixed_tensorsr   r   joinpathptx_path	RawModuleget_functionrD   rd   r   r    re   ra   <module>r      s%       % % % % % % 				   				              < < < < < < < < 0 0 0 0 0 0 2 2 2 2 2 2 2 2       ( ( ( ( ( ( ( ( ( ( " " " " " " A A A A A A A A J J J J J J J J J J X X X X X X X X X X X X D D D D D D3uuZ  )),7788
",H
%
%
%
2
28
<
<_ _ _L B B B BLq q q q q q q q q qre   