
    +jI                        S SK rS SKJr  S SKrS SKrS SKrS SKrS SKrS SK	J
r
  S SKJr  S SKJrJr  S SKJr  S SKJrJr  S SKJr  S SKJrJrJr  S S	KJr  S S
KJrJr  S SKJrJ r J!r!  S SK"J#r#J$r$J%r%J&r&  SSK'J(r(  \)" \" S5      RU                  S5      5      r+\RX                  " \+S9R[                  S5      r.S r/    SS jr0 " S S\5      r1g)    N)files)stderr)infer_auto_device_mapdispatch_model)get_balanced_memory)	load_file	save_file)tqdm)OptionalDictUnion)DFloat11Model)TensorManagerget_no_split_classes)versionthreads_per_blockbytes_per_thread)	get_codecget_32bit_codecget_lutsencode_weights   )!convert_diffusers_to_comfyui_fluxdfloat11z
decode.ptx)pathdecodec                 .   ^ ^ [        T 5      m UU 4S jnU$ )a  
Creates a PyTorch forward pre-hook that decodes compressed DFloat11 weights on-the-fly.

This hook reconstructs full-precision weights from compressed representations
using a custom CUDA kernel during the forward pass.

Args:
    threads_per_block: CUDA thread configuration 
    bytes_per_thread: Number of bytes processed per CUDA thread
    
Returns:
    A forward pre-hook function for PyTorch modules
c                   > U R                   R                  n[        U S5      (       am  U R                  R	                  5        HO  u  p4[        X5      (       a  [        X5      R                  U:X  a  M0  U R                  X4R                  USS95        MQ     U R                  R                  5       nU R                  R                  5       nU R                   R                  S   n[        R                  " X%5      n[        [        R                   " UTS   T-  -  5      5      4n	["        R$                  R'                  UR(                  5         [+        U	TU R,                  U R                   R/                  5       U R                  R/                  5       U R                  R/                  5       U R0                  R/                  5       U R2                  R/                  5       UR/                  5       XvU/	S9  S S S 5        [5        U R6                  5      S:X  GaA  [8        R:                  " USS USS	 USS US	S  45      US S & [8        R<                  " XR>                  5      n
U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        US
S RA                  SS5      U R6                  S   l#        U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        USS	 RA                  SS5      U R6                  S   l#        U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        GO3[5        U R6                  5      S:X  a  [8        R:                  " US
S USS USS
 USS 45      US S & [8        R<                  " XR>                  5      n
USS RA                  SS5      U R6                  S   l#        U
S   RA                  U R6                  S   RB                  U R6                  S   RD                  5      U R6                  S   l#        US
S RA                  SS5      U R6                  S   l#        O-[I        [5        U R6                  5       SU R6                   35      e[        U S5      (       aK  U R                  RK                  5        H,  n[        X5      (       d  M  [        X5      n[M        X5        AM.     g g ! , (       d  f       GN= f)Noffloaded_tensorsT)non_blockingr   )gridblock
shared_memargs
   i  pi  	i   
i  i $  i   r                        	            i  i  i T  z weight_injection_modules 
)'lutsdevicehasattrr   itemsgetattrregister_buffertosign_mantissanumelencoded_exponentshaper   allocate_bfloat16intmathceilcpcudaDeviceindex_decodeshared_mem_sizedata_ptroutput_positionsgapslenweight_injection_modulestorchcattensor_splitsplit_positionsviewout_featuresin_featuresweight	Exceptionkeysdelattr)module_r2   tensor_nametensor
n_elementsn_bytesn_lutsreconstructedblocks_per_gridweightstmpr   r   s               Y/home/wildlama/comfy/ComfyUI/custom_nodes/ComfyUI-DFloat11-Extended/dfloat11_diffusers.pydecode_hook,get_hook_flux_diffusers.<locals>.decode_hook4   sx   ## 6.//'-'?'?'E'E'G#F00gf6R6Y6Y]c6c**;		&W[	8\]	 (H ))//1
))//1""1% &77K tyy4Ea4HK[4[)\]^a WW^^FLL)0AfNdNd$$&''002$$--/''002$$&&&(l  * v../25  %yy-)*DmT]^gFhjw  yB  CL  kM  O\  ]f  ]g  Oh  *i   jM!((8N8NOG8?
HgHghiHjHwHwy  zY  zY  Z[  z\  zh  zh  9iF++A.58EiR[8\8a8abfhl8mF++A.58?
HgHghiHjHwHwy  zY  zY  Z[  z\  zh  zh  9iF++A.58?8H8HIhIhijIkIxIx  {A  {Z  {Z  [\  {]  {i  {i  9jF++A.58?8H8HIhIhijIkIxIx  {A  {Z  {Z  [\  {]  {i  {i  9jF++A.58?
HgHghiHjHwHwy  zY  zY  Z[  z\  zh  zh  9iF++A.58EiR[8\8a8abfhl8mF++A.58?
HgHghiHjHwHwy  zY  zY  Z[  z\  zh  zh  9iF++A.58?8H8HIhIhijIkIxIx  {A  {Z  {Z  [\  {]  {i  {i  9jF++A.58?8H8HIhIhijIkIxIx  {A  {Z  {Z  [\  {]  {i  {i  9jF++A.5 001Q6  %yy-	)*Lm\demNnp}  G  HQ  qR  Ta  bc  dl  Tm  *n   oM!((8N8NOG8Ea8Q8V8VW\^b8cF++A.58?
HgHghiHjHwHwy  zY  zY  Z[  z\  zh  zh  9iF++A.58EiPY8Z8_8_`dfj8kF++A.5 s6#B#BCDD`ag  bA  bA  aB  C  D  D 6.//%77<<>6//!&6CF0	  ? 0c *)s   B%[##
[2)tuple)r   r   rb   s   `` ra   get_hook_flux_diffusersre   $   s      /0M^     c                 *
   US   nUS   nUS   n	U(       d>  [         R                  " U5       V
s/ s H  oR                  S5      (       d  M  U
PM     sn
OU/nSnU(       a  US-  nU(       a  US-  nUS-  n[        XS	9 GH  nU(       d  [         R                  R                  X5      OUn[        [        U5      5      nUR                  5        GH4  u  nnUU R                  5       ;   Ga  U[        U R                  5       5      ;   a  [        U R                  5       5      U   nUR                  UR                  :X  a  UR                  R                  U5        M  [        S
U SUR                   SUR                   3[         S9  M  [        U R#                  5       5      U   nUR                  UR                  :X  a  UR                  U5        GM  [        S
U SUR                   SUR                   3[         S9  GM1  UR%                  S5      nU n['        USS 5       H8  u  nn[)        UU5      (       a  [+        UU5      nM%  [        SU 3[         S9    GM     US   S:X  a  [-        USUR/                  5       5        OU(       a  Ub  US:  a  US   [0        ;   a|  [)        US5      (       d  [-        US0 5        U(       a  UR3                  5       OUUR4                  US   '   Ub1  US:  a+  [7        UR4                  5      [7        [0        5      :X  a  US-  nOUR9                  US   U5        US   S:X  GaO  UR;                  [=        Xx5      5        U	R                  5        GH  u  nn[>        R@                  " USR                  USS 5      5      (       d  M7  [C        U[D        RF                  5      (       a  URH                  n[K        US5        AMq  [C        U[D        RL                  5      (       a  URH                  n[K        US5        AM  [-        US/ 5        U H_  nUR%                  S5      nUnU H  n[+        UU5      nM     URH                  n[K        US5        AURN                  RQ                  U5        Ma     GM      GM  US   S:X  d  GM  URS                  [T        RV                  5      RY                  5       n[-        USUS   S-  S-   USS USS -
  R[                  5       R]                  5       S-  -   5        GM7     GM     U $ s  sn
f )a{  
Loads DFloat11 compressed weights from safetensors files and configures the model
to use them with on-the-fly decompression.

Args:
    model: The PyTorch model to load weights into
    directory_path: Path to the directory containing safetensors files
    dfloat11_config: Configuration for DFloat11 compression
    
Returns:
    The model with configured DFloat11 compression
r   r   pattern_dictz.safetensorszLoading DFloat11 safetensorsz (offloaded to CPUz, memory pinned))desczShape mismatch for z: model z vs loaded file.NzCannot find module path for rN   r   r   r   r:   rR   rJ   rG   rE   r*   r'   )/oslistdirendswithr
   r   joinr   r   r4   
state_dictdictnamed_parametersr;   datacopy_printr   named_bufferssplit	enumerater3   r5   setattrtolistoffloaded_tensor_names
pin_memoryr   rI   r6   register_forward_pre_hookre   re	fullmatch
isinstancenn	EmbeddingrR   rU   LinearrJ   appendrO   rK   uint32numpymaxitem)modeldirectory_pathdfloat11_configcpu_offloadcpu_offload_blocksr   from_single_filer   r   rh   fsafetensors_filesloading_desc	file_name	file_pathloaded_tensorsrX   tensor_valueparambufferpartsrV   ipartpattern
attr_namesr`   	attr_pathtargetpoutput_positions_nps                                  ra   'load_and_replace_tensors_flux_diffusersr      s   * ((;<'(:;'7L
  ::n--aN1K-$2#3  2L,,--L+?	CSBGGLL;Yb	 ;9Y;OP *8)=)=)?%Ke..00$u'='='?"@@ !7!7!9:;GE{{l&8&88

((6 3K=U`amasas`tu  }C  D "%"5"5"78EF|||'9'99\2 3K=Vabnbtbtauv  ~D  E $))#.  )s4GAtvt,,!(!6 <[MJQWX  5 Ry$55(9<;N;N;PQ&,>,FJ\_`J`fklnfo  tJ  gJ#*63F#G#G '0CR H_i,BYBYB[o{F44U2Y? 2 >EWZ[E[beflf~f~b  DG  H^  D_  c_ 2a 7 2 #2259lK Ry$66889PQb9uv 4@3E3E3G/GZ!||GSXXeCRj5IJJ#-fbll#C#C*0--C$+FH$=(+%/		%B%B*0--C$+FH$=(+ %,F4NPR$S5?	090D1716A5<VQ5GF 27 /5mm(/(A,/(.(G(G(N(Nv(V 6@ 4H4 r&88.:.?.?.M.S.S.U+"--a014q8<OPQPR<SVijmkmVn<n;s;s;u;z;z;|  @A  <A  Ac *@ @| LQs   TTc                      ^  \ rS rSrU 4S jr\         SS\S\\   S\S\\\	\
\4   \	\
\4   4      S\S\\
   S	\S
\S\\\\\   4      4S jj5       rSrU =r$ )DFloat11FluxDiffusersModeli  c                 "   > [         TU ]  5         g )N)super__init__)self	__class__s    ra   r   #DFloat11FluxDiffusersModel.__init__  s    rf   dfloat11_model_name_or_pathr2   
device_map
max_memoryr   r   r   r   rh   c           
      @   U	(       ai  [         R                  R                  U5      (       a  UnO[         R                  R                  U5      (       a  [	        SU S35      e[        SU S35      e[         R                  R                  U5      (       a  UnO?UR                  SS5      n[         R                  R                  U5      (       d	  [        XS9  [        S5        U(       ag  U	(       a  S	[        [        [        U
S
.0nOG[        [         R                  R                  US5      SSS9 n[        R!                  U5      nSSS5        UnO[#        S5      e[%        W[&        5      (       a  S	U;   a  US	   nO)[)        US	5      (       a  UR*                  nO[-        S5      e[/        XUXgXS9  U(       dL  SnUR1                  5       R3                  5        H  nUUR4                  -  nM     [        SUS-  S S3[6        S9  U(       a  UR9                  U5      nU$ US:X  d   S5       e[;        UUS   5      n[=        XUS9n[?        XUS9n[A        X5      n[C        S URE                  5        5       5      (       a  [        S[6        S9  U$ ! , (       d  f       GNP= f)a  
Load a model with DFloat11 compressed weights from local path or Hugging Face Hub.

Args:
    dfloat11_model_name_or_path: Local path or HF Hub model name
    device: Target device for the model
    device_map: Strategy for distributing model across devices
    max_memory: Maximum memory allocation per device
    bfloat16_model: Optional pre-initialized model to load weights into
    cpu_offload: Enables CPU offloading; only keeps a single block of weights in GPU at once
    cpu_offload_blocks: Number of transformer blocks to offload to CPU; if None, offload all blocks
    pin_memory: Enables memory-pinning/page-locking when using CPU offloading
    from_single_file: Whether to load a single safetensors file
    pattern_dict: Dictionary mapping regex patterns to submodule lists
    **kwargs: Additional arguments passed to AutoModelForCausalLM.from_config
    
Returns:
    Model with DFloat11 compressed weights configured for on-the-fly decompression
zeExpected `dfloat11_model_name_or_path` to be the path to a safetensors file, but found a directory: "z".z
The file "z" does not exist./__)	local_dirz0Using overriden DFloat11FluxDiffusersModel classr   )r   r   r   rh   zconfig.jsonrzutf-8)encodingNz"`bfloat16_model` must be specifiedzd"dfloat11_config" not found: it is expected to be found in the config file or passed as an argument.)r   r   r   r   r   zTotal model size: g    eAz0.4fz GBrk   autoz>device_map should be 'auto' if no specific device is provided.rh   )r   no_split_module_classesc              3   R   #    U  H  oR                   R                  S :H  v   M     g7f)cpuN)r2   type).0r   s     ra   	<genexpr>=DFloat11FluxDiffusersModel.from_pretrained.<locals>.<genexpr>z  s     N;M%<<$$-;Ms   %'zqWarning: Some model layers are on CPU. For inference, ensure the model is fully loaded onto CUDA-compatible GPUs.)#ro   r   isfileisdirIsADirectoryErrorFileNotFoundErrorexistsreplacesnapshot_downloadrx   r   r   r   openrr   jsonloadrS   r   rt   r3   r   AttributeErrorr   rs   valuesnbytesr   r7   r   r   r   r   any
parameters)clsr   r2   r   r   bfloat16_modelr   r   r   r   rh   kwargsdfloat11_model_pathconfigr   r   r   model_bytesr   no_split_classess                       ra   from_pretrained*DFloat11FluxDiffusersModel.from_pretrained  s   F ww~~9::&A#:;;'  +P  Ql  Pm  mo  )p  q  q'*5P4QQb(cddww~~9::&A#&A&I&I#t&T#ww~~&9::%&Aa@A %#*->,<(4	( "'',,':MJCZabfg!YYq\F c #E@AA fd##(9V(C$%67OV.//$44O   "H  I  I 	0#!	
 K))+224u||+ 5 &{S'8&>cBP HHV$E  'i)ii'3E?>;Z[,UcstJ.ueuvJ"55E N5;K;K;MNNN  J  QW  X[ cbs   /J
J )	Nr   NNFNTFN)__name__
__module____qualname____firstlineno__r   classmethodstrr   r   r   r=   boolrt   listr   __static_attributes____classcell__)r   s   @ra   r   r     s      !% GK!,0!&7;l%(l l 	l
 T%S/5c?"BCDl l %SMl l l tCcN34l lrf   r   )FNTF)2cupyr@   importlib.resourcesr   r   r>   uuidro   rK   torch.nnr   sysr   
accelerater   r   accelerate.utilsr   safetensors.torchr   r	   r
   typingr   r   r   r   r   dfloat11.dfloat11r   r   r   r   r   dfloat11.dfloat11_utilsr   r   r   r   convert_fixed_tensorsr   r   joinpathptx_path	RawModuleget_functionrD   re   r   r   r   rf   ra   <module>r      s     % 	   	    < 0 2  ( ( " A J J X X DuZ )),78
,,H
%
2
28
<_L BLq qrf   