
    3jC+              	          S r SSKrSSKJr  SSKJs  Jr  SS\R                  S\R                  S\S\R                  4S jjr	S\R                  S	\S\R                  4S
 jr
S\R                  S	\S\R                  4S jr " S S\R                  5      r " S S\R                  5      r  SS\R                  S\\   S-  SS4S jjrg)zQuantized layers for Gemma: INT2/4/8 packed-weight Linear and Embedding,
plus SRQ (Static Range Quantization) activation rounding.    Nxscalebitsreturnc                 n   UR                  U R                  5      nSUS-
  -  S-
  nU* S-
  nUS:g  n[        R                  " XQ[        R                  " U5      5      n[        R
                  " [        R                  " X-  5      [        U5      [        U5      5      U-  n[        R                  " XWU 5      $ )aK  Apply Static Range Quantization rounding and clipping (in x's dtype).

A `scale` of 0 means the layer is uncalibrated, in which case this is a no-op. The guard uses
`torch.where` rather than `scale.item()` so it stays on-device and `torch.compile`-friendly (an
`.item()` would force a host-device sync and break `fullgraph=True`).
      r   )todtypetorchwhere	ones_likeclamproundfloat)r   r   r   	max_value	min_value
calibrated
safe_scalex_qs           _/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/integrations/gemma_quant.py	apply_srqr      s     HHQWWEdQh!#I
QI!JZ0FGJ
++ekk!.153CU9EU
VYc
cC;;z**    packedoriginal_widthc                 P   U R                  [        R                  5      n U S-  R                  [        R                  5      S-
  nU S-	  R                  [        R                  5      S-
  n[        R                  " X#/SS9R
                  " / U R                  SS QSP76 nUSSU24   $ )a  Unpack int4 values from uint8 storage. Two values per byte.

Each byte: low nibble = first value, high nibble = second value.
Values are stored unsigned in [0, 15] and shifted to signed [-8, 7].
Cast to uint8 first so the right shift is logical, not arithmetic.
         dimN.r
   r   uint8int8stackreshapeshape)r   r   lowhighinterleaveds        r   _unpack_int4r,   '   s     YYu{{#FD=

UZZ
(1
,CaKEJJ'!+D++skr2::RFLL"<MRrRKsO^O+,,r   c                    U R                  [        R                  5      n U S-  R                  [        R                  5      S-
  nU S-	  S-  R                  [        R                  5      S-
  nU S-	  S-  R                  [        R                  5      S-
  nU S-	  R                  [        R                  5      S-
  n[        R                  " X#XE/SS9R
                  " / U R                  SS QSP76 nUSSU24   $ )	zUnpack int2 values from uint8 storage. Four values per byte.

Bits [1:0]/[3:2]/[5:4]/[7:6] hold values 0..3 each, shifted to signed [-2, 1].
   r   r      r    r!   N.r#   )r   r   v0v1v2v3r+   s          r   _unpack_int2r4   5   s    
 YYu{{#F
4-		EJJ	'!	+BQ;$
	"	"5::	.	2BQ;$
	"	"5::	.	2B
A+		%**	%	)B++rr.B7??WcrARWTVWKsO^O+,,r   c            	          ^  \ rS rSrSr  SS\S\S\S\4U 4S jjjrSS	\R                  S-  S
\R                  4S jjrS\R                  S
\R                  4S jrS
\4S jrSrU =r$ )QuantizedLinearC   zFLinear layer with INT2/4/8 packed weights and SRQ activation rounding.in_featuresout_featuresbiasnum_bitsc                   > [         TU ]  XUS9  X@l        US:X  a,  US-   S-  n[        R                  " X%[        R
                  S9nOUUS:X  a,  US-   S-  n[        R                  " X%[        R
                  S9nO#[        R                  " X![        R                  S9n[        R                  " USS9U l	        [        R                  " [        R                  " US[        R                  S95      U l        [        R                  " [        R                  " S	[        R                  S95      U l        [        R                  " [        R                  " S	[        R                  S95      U l        g )
N)r:   r   r.   r   r   r	   Frequires_gradg        )super__init__r;   r   emptyr$   r%   nn	Parameterweightonesfloat32weight_scaletensorinput_activation_scaleoutput_activation_scale)selfr8   r9   r:   r;   	packed_inweight_storage	__class__s          r   rA   QuantizedLinear.__init__F   s     	>  q=$qQ.I"[[TN]$qQ.I"[[TN"[[%**UNll>GLLL!5==)YZ ')ll5<<5==3Y&Z#')||ELLEMM4Z'[$r   Nr   r   c                 Z   U R                   S:X  a!  [        U R                  U R                  5      nO=U R                   S:X  a!  [	        U R                  U R                  5      nOU R                  nUc  X R
                  -  $ UR                  U5      U R
                  R                  U5      -  $ )u   Dequantize weights (handles int2/int4/int8 storage). If `dtype` is given,
the math runs in that dtype; otherwise int×fp32 promotion gives fp32.r   r   )r;   r4   rE   r8   r,   rH   r
   )rL   r   int_weightss      r   _dequantize_weights#QuantizedLinear._dequantize_weightsa   s     ==A&t{{D4D4DEK]]a&t{{D4D4DEK++K=!2!222~~e$t'8'8';';E'BBBr   r   c                     [        XR                  5      n[        R                  " XR	                  UR
                  5      U R                  5      n[        X R                  5      $ N)r   rJ   FlinearrS   r   r:   rK   )rL   r   outs      r   forwardQuantizedLinear.forwardn   sF    a445hhq22177;TYYG::;;r   c                 r    SU R                    SU R                   SU R                  S L SU R                   3$ )Nzin_features=z, out_features=z, bias=, num_bits=)r8   r9   r:   r;   rL   s    r   
extra_reprQuantizedLinear.extra_reprs   sG    4++,OD<M<M;N OIIT)*+dmm_F	
r   )rJ   r;   rK   rE   rH   )Fr   rV   )__name__
__module____qualname____firstlineno____doc__intboolrA   r   r   TensorrS   rZ   strr_   __static_attributes____classcell__rO   s   @r   r6   r6   C   s    P \\ \ 	\
 \ \6Ct); Cu|| C< <%,, <

C 
 
r   r6   c                   &  ^  \ rS rSrSr  SS\S\S\R                  S\S\4
U 4S jjjr	\
S	\R                  4S
 j5       rS\R                  S\R                  S	\R                  4S jrS\R                  S	\R                  4S jrS	\4S jrSrU =r$ )QuantizedEmbeddingz   a  Embedding with INT2/4/8 packed table, per-row dequant scale, and architectural embed_scale.

Does NOT subclass `nn.Embedding` because the packed-int storage isn't a usable
embedding table on its own: indexing `.embedding_quantized[idx]` returns packed
bytes, not a row of size `embedding_dim`. Callers expect `embed_tokens.weight[idx, :]`
to return the *dequantized* row, so we expose `weight` as a property (below)
that returns the dequantized table on demand.
num_embeddingsembedding_dimoutput_dtypeembed_scaler;   c                   > [         TU ]  5         Xl        X l        X@l        XPl        X0l        US:X  a,  US-   S-  n[        R                  " X[        R                  S9nOUUS:X  a,  US-   S-  n[        R                  " X[        R                  S9nO#[        R                  " X[        R                  S9n[        R                  " USS9U l        [        R                  " [        R                  " US[        R                  S95      U l        g )Nr   r.   r   r=   r	   Fr>   )r@   rA   rp   rq   scalar_embed_scaler;   rr   r   rB   r$   r%   rC   rD   embedding_quantizedrF   rG   embedding_scale)	rL   rp   rq   rr   rs   r;   
packed_dimembed_storagerO   s	           r   rA   QuantizedEmbedding.__init__   s     	,*"- ( q='!+1J!KK%++VM]'!+1J!KK%++VM!KKUZZXM#%<<U#S !||EJJ~qPUP]P],^_r   r   c                 N    U R                  U R                  U R                  5      $ )zDequantized embedding table (no architectural `embed_scale` applied).

Mirrors `nn.Embedding.weight` so callers can do `weight[idx, :]` and get
the same unscaled row they'd get from a non-quantized embedding.
)rS   rv   rw   r^   s    r   rE   QuantizedEmbedding.weight   s#     ''(@(@$BVBVWWr   
quant_rows
scale_rowsc                 d   U R                   S:X  a  [        XR                  5      nO(U R                   S:X  a  [        XR                  5      nOUnU R                  UR                  S   -  nUR                  USS9nUR                  U R                  5      UR                  U R                  5      -  $ )zFUnpack int2/int4/int8 + apply per-row block-wise dequantization scale.r   r   r    r!   )r;   r,   rq   r4   r(   repeat_interleaver
   rr   )rL   r}   r~   int_rows
block_sizer   s         r   rS   &QuantizedEmbedding._dequantize_weights   s    ==A#J0B0BCH]]a#J0B0BCH!H'':+;+;B+??
,,ZR,@{{4,,-9J9J0KKKr   	input_idsc                     U R                  U R                  U   U R                  U   5      nX R                  -  R	                  U R
                  5      $ rV   )rS   rv   rw   ru   r
   rr   )rL   r   results      r   rZ   QuantizedEmbedding.forward   sK    ))$*B*B9*MtOcOcdmOno00044T5F5FGGr   c                 n    SU R                    SU R                   SU R                   SU R                   3$ )Nznum_embeddings=z, embedding_dim=r]   z, embed_scale=)rp   rq   r;   ru   r^   s    r   r_   QuantizedEmbedding.extra_repr   sE    d1122B4CUCUBV WnT5L5L4MO	
r   )rq   rv   rw   r;   rp   rr   ru   )      ?r   )ra   rb   rc   rd   re   rf   r   r   r   rA   propertyrh   rE   rS   
LongTensorrZ   ri   r_   rj   rk   rl   s   @r   rn   rn   z   s     !`` ` kk	`
 ` ` `6 X X XLell L LY^YeYe LH!1!1 Hell H
C 
 
r   rn   modelmodules_to_not_convertc                 "  ^ SSK nSSKJn  UR                  nUR                  nUR
                  =(       d    0 n[        UR                  5       5       VV	s0 s H  u  pSU 3U	_M     sn	nmU(       a0  UR                  SR                  S [        U5       5       5      5      OSn
[        U R                  5       5       GHG  u  pU" X5      (       d  M  SU0nU
bK  U
R                  U5      =nb7  [        U4S	 jUR                  5       R                  5        5       5      n	SU0U	En[!        U["        R$                  5      (       aM  U(       d  M  ['        SUR(                  UR*                  [-        US
S5      UR.                  R0                  S.UD6nOR[!        U["        R2                  5      (       a0  [5        SUR6                  UR8                  UR:                  SLS.UD6nOGM%  UR=                  S5        U R?                  X5        GMJ     U $ s  sn	nf )aC  Replace `nn.Linear` / `nn.Embedding` modules with `QuantizedLinear` / `QuantizedEmbedding`.

Per-module bit widths come from `quantization_config.module_quant_configs`.
`nn.Embedding` modules are only replaced when `quantize_embeddings` is True.
Modules whose name matches an entry in `modules_to_not_convert` are skipped.
r   Nr   )should_convert_moduleg|c              3   8   #    U  H  u  pS U SU S3v   M     g7f)z(?P<g>)N ).0ipatterns      r   	<genexpr>,replace_with_quant_layers.<locals>.<genexpr>   s#     eEdzqeA3ay2Eds   r;   c              3   <   >#    U  H  u  pUc  M
  TU   v   M     g 7frV   r   )r   r   voverrides_by_groups      r   r   r      s!     i>Wda[\1.q1>Ws   	ru   r   )rp   rq   rs   rr   )r8   r9   r:   Fr   ) requantizers.quantizers_utilsr   quantize_embeddingsr;   module_quant_configs	enumeratevaluescompilejoinlistnamed_modulessearchnext	groupdictitems
isinstancerC   	Embeddingrn   rp   rq   getattrrE   r   Linearr6   r8   r9   r:   requires_grad_set_submodule)r   quantization_configr   r   r   r   r;   r   r   overridematchernamemoduleoptsmatch
new_moduler   s                   @r   replace_with_quant_layersr      s    C-AA"++H.CCIr
 @II]IdIdIf?gh?gAaS'8+?gh   	

388eYOcEdeef  U0023$TBBH%W^^D-A$AE#Nieoo>O>U>U>WiiH5H5Dfbll++&+ %44$22#F,@#F#]]00	
 J 		**( "..#00[[, 	J !!%(D-7 48 LG is   H)r   )NN)re   r   torch.nnrC   torch.nn.functional
functionalrW   rh   rf   r   r,   r4   r   r6   Modulern   r   ri   r   r   r   r   <module>r      s   =    + +ell +# +ell + - -s -u|| -- -s -u|| -4
bii 4
nC
 C
P /39999 !I,9 
	9r   