
    
3j	u                        S r SSKrSSKJrJrJrJr  SSKrSSKJ	r	  SSK
J	s  Jr  SSKJrJr  SSKJr  SSKJr  \R(                  " \5      r " S S	\	R.                  5      rS
\\\	R.                  \4   S\	R.                  S\\   S\\R4                     S\\R6                     S\4S jr " S S\5      r " S S\	R.                  5      r " S S\5      rg)z5Knowledge distillation training tasks and components.    N)DictOptionalTupleUnion)create_modelgroup_parameters)unwrap_model   )TrainingTaskc                     ^  \ rS rSrSr     SS\\\R                  4   S\	\
   S\
S\	\   S\	\R                     S\	\R                     4U 4S	 jjjrS
\R                  S\R                  4S jr    SS\S\	\   S\S\SS 4
S jjr SS
\R                  S\S\R                  4S jjr  SS
\R                  S\	\R                     S\	\R                     S\R                  4S jjrSrU =r$ )DistillationTeacher   a1  Wrapper for a teacher model used in knowledge distillation.

Creates and manages a pre-trained teacher model for knowledge distillation,
handling model creation and normalization differences between teacher and student.

Can be created from:
- A model name string (creates the model internally with pretrained weights)
- An existing nn.Module (wraps it with the necessary interface)

Args:
    model_name_or_module: Either a model name string or an nn.Module
    num_classes: Number of output classes (required if model_name_or_module is a string)
    in_chans: Number of input channels (used if model_name_or_module is a string)
    pretrained_path: Optional path to pretrained weights (used if model_name_or_module is a string)
    device: Device to place the model on
    dtype: Model dtype (uses float32 if None)
model_name_or_modulenum_classesin_chanspretrained_pathdevicedtypec           	      $  > [         TU ]  5         [        U[        5      (       aC  [        R                  SU S35        SS0nU(       a  [        UUS9US'   [        SUUUUUS.UD6nOC[        U[        R                  5      (       a  UnO![        S[        U5      R                   35      eUR                  5         Xl        [        U5      n	[!        U	S	5      (       a9  U	R"                  R%                  S
S5      n
U	R"                  R%                  SS5      nOSn
Sn[&        R(                  " XUS9R+                  SSSS5      n[&        R(                  " XUS9R+                  SSSS5      nU R-                  SUSS9  U R-                  SUSS9  S U l        g )NzCreating KD teacher model: ''
pretrainedT)filer   pretrained_cfg_overlay)
model_namer   r   r   r   z8model_name_or_module must be a string or nn.Module, got pretrained_cfgmean)g
ףp=
?gv/?gCl?std)gZd;O?gy&1?g?r   r   r
   mean_kdF
persistentstd_kd )super__init__
isinstancestr_loggerinfodictr   nnModule	TypeErrortype__name__evalmodelr	   hasattrr   gettorchtensorviewregister_buffer_compiled_forward_features)selfr   r   r   r   r   r   pretrained_kwargsr2   model_unwrappedr   r   r    r#   	__class__s                 P/home/wildlama/miniconda3/lib/python3.13/site-packages/timm/task/distillation.pyr&   DistillationTeacher.__init__%   s    	*C00LL78L7MQOP!-t 4>B( +?!":;
 ! /'! $E ,bii88(EJ4PdKeKnKnJop  	


 'u-?$455"1155f>STD!0044U<QRC(D'C,,t%@EEaQPQRc>CCAr1aPYEBXv%@*.'    inputreturnc                 2   [        U R                  S5      (       a  [        U R                  S5      (       d-  [        SU R                  R                  R                   S35      eU R                  R                  U5      nU R                  R                  USS9$ )Nforward_featuresforward_headzModel zi does not support feature extraction. Ensure the model has 'forward_features' and 'forward_head' methods.T
pre_logits)r3   r2   
ValueErrorr=   r0   rD   rE   )r:   rA   feature_maps      r>   _forward_features%DistillationTeacher._forward_features[   s    tzz#566gdjjR`>a>a--667 8V V  jj11%8zz&&{t&DDr@   backendmodecompile_modelcompile_featuresc                     U(       a(  [         R                  " U R                  4XS.UD6U l        U(       a)  [         R                  " U R                  4UUS.UD6U l        U $ )z;Compile teacher inference paths used by distillation tasks.rL   rM   )r5   compiler2   rJ   r9   )r:   rL   rM   rN   rO   compile_kwargss         r>   rR   DistillationTeacher.compiled   s_     tzz`7`Q_`DJ.3mm&&// !	/D+ r@   return_featuresc                     U(       a/  U R                   b  U R                  U5      $ U R                  U5      $ U R                  U5      $ )a  Forward pass through teacher model.

Args:
    input: Input tensor (should already be normalized for teacher)
    return_features: Whether to return pooled pre-logits features instead of logits

Returns:
    Logits or pooled pre-logits features depending on return_features flag
)r9   rJ   r2   )r:   rA   rU   s      r>   forwardDistillationTeacher.forwardx   sD     ..:66u==))%00::e$$r@   student_meanstudent_stdc                     Ub  Uc  U$ [         R                  " X R                  5      (       a'  [         R                  " X0R                  5      (       a  U$ X-  U-   U R                  -
  U R                  -  $ )a7  Normalize input to match teacher's expected normalization.

Args:
    input: Input tensor (already normalized for student)
    student_mean: Student normalization mean buffer [1, 3, 1, 1]
    student_std: Student normalization std buffer [1, 3, 1, 1]

Returns:
    Input tensor normalized for the teacher model
)r5   equalr    r#   )r:   rA   rY   rZ   s       r>   normalize_input#DistillationTeacher.normalize_input   s`      ;#6L;;|\\22u{{;P[P[7\7\L#l2T\\AT[[PPr@   )r9   r2   )N   NNN)inductorNTFF)NN)r0   
__module____qualname____firstlineno____doc__r   r(   r,   r-   r   intr5   r   r   r&   TensorrJ   boolrR   rW   r]   __static_attributes____classcell__r=   s   @r>   r   r      sp   * *.-1-1+/4/"'RYY"74/ "#4/ 	4/
 &c]4/ U\\*4/ EKK(4/ 4/lEu|| E E &"&"&%* 3-  	
 # 
. %*%<<% "% 
	%0 4826	Q<<Q #5<<0Q "%,,/	Q
 
Q Qr@   r   teacherstudent_modelr   r   r   rB   c           	          [        U [        5      (       a  U $ [        U5      nUR                  nUR                  n[        U UUUUUS9$ )an  Resolve teacher input to a DistillationTeacher instance.

Args:
    teacher: Model name string, nn.Module, or DistillationTeacher
    student_model: Student model to infer num_classes/in_chans from
    pretrained_path: Optional path to teacher pretrained weights
    device: Device for teacher
    dtype: Dtype for teacher

Returns:
    DistillationTeacher instance
)r   r   r   r   r   r   )r'   r   r	   r   r   )rl   rm   r   r   r   student_unwrappedr   r   s           r>   _resolve_teacherrp      sY    & '.// %]3#//K ))H$' r@   c                     ^  \ rS rSrSr         SS\R                  S\\\R                  \	4   S\
\R                     S\
\   S\S\
\   S	\
\   S
\S\
\R                     S\
\R                     S\4U 4S jjjr SS\
\   SS 4S jjr  SS\S\
\   S\R                  4S jjrS\R(                  S\R(                  S\\\R(                  4   4S jrSrU =r$ )LogitDistillationTask   a  Logit-based knowledge distillation task.

Performs distillation by matching student and teacher output logits using
KL divergence with temperature scaling.

Loss weighting supports two modes:
1. Independent weights: loss = task_loss_weight * task_loss + distill_loss_weight * distill_loss
2. Complementary mode: loss = task_loss_weight * task_loss + (1 - task_loss_weight) * distill_loss
   (used when only task_loss_weight is specified)

Args:
    student_model: Student model to train
    teacher_model: Teacher model - can be a model name string, nn.Module, or DistillationTeacher
    criterion: Task loss function (default: CrossEntropyLoss)
    teacher_pretrained_path: Path to teacher pretrained weights (used when teacher_model is a string)
    loss_type: Type of distillation loss (currently only 'kl' supported)
    distill_loss_weight: Weight for distillation loss
    task_loss_weight: Weight for task loss
    temperature: Softmax temperature for distillation (typical values: 1-4)
    device: Device for task tensors/buffers
    dtype: Dtype for task tensors/buffers
    verbose: Enable info logging

Example:
    >>> # With model name string (num_classes/in_chans inferred from student)
    >>> task = LogitDistillationTask(
    ...     student_model=model, teacher_model='resnet50',
    ...     criterion=nn.CrossEntropyLoss(),
    ...     task_loss_weight=0.3, temperature=4.0,
    ...     device=torch.device('cuda'),
    ... )
    >>> # With raw model
    >>> task = LogitDistillationTask(
    ...     student_model=model, teacher_model=my_teacher_model,
    ...     criterion=nn.CrossEntropyLoss(),
    ...     task_loss_weight=0.3, temperature=4.0,
    ... )
rm   teacher_model	criterionteacher_pretrained_path	loss_typedistill_loss_weighttask_loss_weighttemperaturer   r   verbosec                   > [         TU ]  XUS9  [        UUUU R                  U R                  5      nXl        Xl        Ub  UO[        R                  " 5       U l	        XPl
        Xl        US:w  a  [        SU S35      e[        U5      n[        R                  " UR                   S   U R                  U R                  S9R#                  SSSS5      n[        R                  " UR                   S	   U R                  U R                  S9R#                  SSSS5      nU R%                  S
USS9  U R%                  SUSS9  UbI  X`l        Ub  UOSU l        U R*                  (       a%  [,        R/                  SU R(                   SU 35        OUbG  Xpl        SU-
  U l        U R*                  (       a%  [,        R/                  SU SU R&                   35        ONSU l        SU l        U R*                  (       a/  [,        R/                  SU R(                   SU R&                   35        U R*                  (       a  [,        R/                  SU SU 35        g g )Nr   r   r{   klzUnsupported loss_type 'z$'. Currently only 'kl' is supported.r   r   r
   r   r   rY   Fr!   rZ         ?z9LogitDistillationTask: Independent weights - task_weight=, distill_weight=z8LogitDistillationTask: Complementary mode - task_weight=z;LogitDistillationTask: Default equal weights - task_weight=z!LogitDistillationTask: loss_type=z, temperature=)r%   r&   rp   r   r   trainable_modulerl   r,   CrossEntropyLossru   rw   rz   rH   r	   r5   r6   r   r7   r8   rx   ry   r{   r)   r*   )r:   rm   rt   ru   rv   rw   rx   ry   rz   r   r   r{   rl   ro   rY   rZ   r=   s                   r>   r&   LogitDistillationTask.__init__   sV    	WE ##KKJJ
 !.&/&;ATATAV"&6ykAefgg )7||,,V4;;**
 $q"a
	 	
 ll,,U3;;**
 $q"a
	 	
 	^\eL]KEJ *':$8H8T$4Z]D!||##'#8#8"99JK^J_a )$4!'*-='=D$||##3"44EdF^F^E_a (+D$$'D!||##'#8#8"99J4KcKcJdf
 <<LL3I;n[MZ r@   
device_idsrB   c                     SSK Jn  U R                  R                  5        H
  nSUl        M     U" U R
                  4SU0UD6U l        U $ )aL  Prepare task for distributed training.

Wraps the student model in DistributedDataParallel (DDP) while leaving
the frozen teacher model unwrapped.

Args:
    device_ids: List of device IDs for DDP (e.g., [local_rank])
    **ddp_kwargs: Additional arguments passed to DistributedDataParallel

Returns:
    self (for method chaining)
r   DistributedDataParallelFr   torch.nn.parallelr   rl   
parametersrequires_gradr   r:   r   
ddp_kwargsDDPparams        r>   prepare_distributed)LogitDistillationTask.prepare_distributedD  L    " 	E\\,,.E"'E / !$D$9$9 _j _T^ _r@   rL   rM   c                     [         R                  " U R                  4XS.UD6U l        U R                  U l        U R                  R                  " SUUSSS.UD6  U R                  $ )z=Compile student eval/train forward and teacher logit forward.rQ   TFrL   rM   rN   rO   r$   )r5   rR   r   
eval_modelrl   )r:   rL   rM   rS   s       r>   rR   LogitDistillationTask.compile]  sn     !&d.C.C rW rcq r// 	
"		

 	
 $$$r@   rA   targetc                 ^   U R                  U5      nU R                  X25      n[        R                  " 5          U R                  R                  XR                  U R                  5      nU R	                  UR                  5       SS9nSSS5        [        R                  " X0R                  -  SS9n[        R                  " WU R                  -  SS9n[        R                  " XxSSS9U R                  S	-  -  n	U R                  U-  U R                  U	-  -   n
U
UUU	S
.$ ! , (       d  f       N= f)aq  Forward pass with logit distillation.

Args:
    input: Input tensor [B, C, H, W]
    target: Target labels [B]

Returns:
    Dictionary containing:
        - 'loss': Combined training loss (task + distillation)
        - 'output': Student logits (for metrics)
        - 'task_loss': Classification loss component
        - 'kd_loss': Logit distillation loss component
FrU   Nr   )dim	batchmeanT)	reduction
log_target   lossoutput	task_losskd_loss)r   ru   r5   no_gradrl   r]   rY   rZ   detachFlog_softmaxrz   kl_divry   rx   )r:   rA   r   student_logitsr   input_kdteacher_logitsprob_sprob_tr   
total_losss              r>   rW   LogitDistillationTask.forwardp  s   $ ..u5NN>:	]]_||33E;L;LdN^N^_H!\\(//*;U\SN  ~0@0@@bI~0@0@@bI((6[TRVZVfVfjkVkl**Y69Q9QT[9[[
 $"	
 	
 _s   AD
D,)ru   rx   r   rw   ry   rl   rz   r   )	NNr~   NNr   NNTNr`   N)r0   rb   rc   rd   re   r,   r-   r   r(   r   r   floatr5   r   r   rh   r&   listr   rR   rg   r   rW   ri   rj   rk   s   @r>   rr   rr      ss   %V .259!3704!$-1+/ Q99Q !bii1D!DEQ  		*	Q
 &.c]Q Q "*%Q 'uoQ Q U\\*Q EKK(Q Q Qj *.  
!	6 &"&%% 3-%
 
%&$
<<$
 LL$
 
c5<<	 	$
 $
r@   rr   c                      ^  \ rS rSrSr SS\R                  S\\R                     4U 4S jjjrS r	SS\
4S jjrS	\R                  S
\\R                  \R                  4   4S jrSrU =r$ )"FeatureDistillationTrainableModulei  a  Trainable module for feature distillation.

Wraps student model and projection layer into a single module where all
trainable forward operations happen inside forward(). This ensures proper
DDP wrapping when the module is used with DistributedDataParallel.
rm   
projectionc                 :   > [         TU ]  5         Xl        X l        g)zCreate trainable module wrapper for feature distillation.

Args:
    student_model: Student model to train
    projection: Optional projection layer (Linear layer or None)
N)r%   r&   studentr   )r:   rm   r   r=   s      r>   r&   +FeatureDistillationTrainableModule.__init__  s     	$$r@   c                     [        U R                  5      n[        US5      (       d
  [        5       $ UR	                  5        Vs1 s H  nSU-   iM
     sn$ s  snf )Nno_weight_decaystudent.)r	   r   r3   setr   )r:   r   names      r>   r   2FeatureDistillationTrainableModule.no_weight_decay  sM    t||,w 1225L.5.E.E.GH.Gd
T!.GHHHs   Acoarsec                    ^^ [        U R                  5      n[        US5      (       d  0 $ [        X"R	                  US9SS9m[        TR                  5       SS9mUU4S jnU$ )Ngroup_matcher)r   T)reverser   )defaultc                 p   > U R                  S5      (       a  TR                  U [        S5      S  T5      $ T$ )Nr   )
startswithr4   len)r   student_layer_map
task_layers    r>   _matcherBFeatureDistillationTrainableModule.group_matcher.<locals>._matcher  s7    z**(,,T#j/2B-CZPPr@   )r	   r   r3   r   r   maxvalues)r:   r   r   r   r   r   s       @@r>   r   0FeatureDistillationTrainableModule.group_matcher  sb    t||,w00I,W6K6KSY6K6Zdhi*113Q?
	
 r@   rA   rB   c                     U R                   R                  U5      nU R                   R                  U5      nU R                   R                  USS9nU R                  b  U R                  U5      nX44$ )zForward pass through student and projection.

Args:
    input: Input tensor [B, C, H, W]

Returns:
    Tuple of (student_logits, student_features) where features are
    optionally projected to match teacher dimension.
TrF   )r   rD   rE   r   )r:   rA   rI   r   student_featuress        r>   rW   *FeatureDistillationTrainableModule.forward  si     ll33E:22;?<<44[T4R??&#/?@//r@   )r   r   r   ra   )r0   rb   rc   rd   re   r,   r-   r   r&   r   rh   r   r5   rg   r   rW   ri   rj   rk   s   @r>   r   r     sv     /3%99% !+% %ID 0U\\ 0eELL%,,4N.O 0 0r@   r   c                   *  ^  \ rS rSrSr         S%S\R                  S\\\R                  \	4   S\
\R                     S\
\   S\
\   S	\
\   S
\
\   S\
\   S\
\R                     S\
\R                     S\4U 4S jjjr\S\R                  S\4S j5       r S&S\
\   SS 4S jjr  S'S\S\
\   S\R                  4S jjrS(S\
\R                     S\S\
\R                     4U 4S jjjr  S(S\
\R                     S\S\\\\\R2                  4   4   4S jjrS)S\4S jjr   S*S\
\\\\\R2                  4   4      S\S\
\R                     S\SS4
S  jjrS!\R2                  S"\R2                  S\\\R2                  4   4S# jrS$rU =r$ )+FeatureDistillationTaski  a  Feature-based knowledge distillation task.

Performs distillation by matching student and teacher intermediate features
(pooled pre-logits) using MSE loss. Automatically creates a projection layer
if student and teacher feature dimensions differ.

Loss weighting supports two modes:
1. Independent weights: loss = task_loss_weight * task_loss + distill_loss_weight * distill_loss
2. Complementary mode: loss = task_loss_weight * task_loss + (1 - task_loss_weight) * distill_loss
   (used when only task_loss_weight is specified)

Args:
    student_model: Student model to train
    teacher_model: Teacher model - can be a model name string, nn.Module, or DistillationTeacher
    criterion: Task loss function (default: CrossEntropyLoss)
    teacher_pretrained_path: Path to teacher pretrained weights (used when teacher_model is a string)
    distill_loss_weight: Weight for distillation loss
    task_loss_weight: Weight for task loss
    student_feature_dim: Student pre-logits dimension (auto-detected if None)
    teacher_feature_dim: Teacher pre-logits dimension (auto-detected if None)
    device: Device for task tensors/buffers
    dtype: Dtype for task tensors/buffers
    verbose: Enable info logging

Example:
    >>> # With model name string (num_classes/in_chans inferred from student)
    >>> task = FeatureDistillationTask(
    ...     student_model=model, teacher_model='resnet50',
    ...     criterion=nn.CrossEntropyLoss(),
    ...     distill_loss_weight=5.0, task_loss_weight=1.0,
    ...     device=torch.device('cuda'),
    ... )
Nrm   rt   ru   rv   rx   ry   student_feature_dimteacher_feature_dimr   r   r{   c                   > [         TU ]  XUS9  [        UUUU R                  U R                  5      nXl        Ub  UO[        R                  " 5       U l        UbI  XPl	        Ub  UOSU l
        U R                  (       a%  [        R                  SU R                   SU 35        OUbG  X`l
        SU-
  U l	        U R                  (       a%  [        R                  SU SU R                   35        ONSU l	        SU l
        U R                  (       a/  [        R                  SU R                   SU R                   35        Uc  U R                  U5      nUc  U R                  UR                  5      nS nXx:w  aW  U R                  (       a  [        R                  SU SU 35        [        R                   " XxU R                  U R                  S	9nO&U R                  (       a  [        R                  S
5        [#        X5      U l        ['        U5      n[(        R*                  " UR,                  S   U R                  U R                  S	9R/                  SSSS5      n[(        R*                  " UR,                  S   U R                  U R                  S	9R/                  SSSS5      nU R1                  SUSS9  U R1                  SUSS9  U R                  (       a  [        R                  SU SU 35        g g )Nr}   r   z;FeatureDistillationTask: Independent weights - task_weight=r   z:FeatureDistillationTask: Complementary mode - task_weight=z=FeatureDistillationTask: Default equal weights - task_weight=zCreating projection layer: z -> r   z.Feature dimensions match, no projection neededr   r
   r   r   rY   Fr!   rZ   z%FeatureDistillationTask: student_dim=z, teacher_dim=)r%   r&   rp   r   r   rl   r,   r   ru   rx   ry   r{   r)   r*   _detect_feature_dimr2   Linearr   r   r	   r5   r6   r   r7   r8   )r:   rm   rt   ru   rv   rx   ry   r   r   r   r   r{   rl   r   ro   rY   rZ   r=   s                    r>   r&    FeatureDistillationTask.__init__  s    	WE ##KKJJ
 &/&;ATATAV *':$8H8T$4Z]D!||##'#8#8"99JK^J_a )$4!'*-='=D$||##3"44EdF^F^E_a (+D$$'D!||##'#8#8"99J4KcKcJdf &"&":":="I&"&":":7=="I 
5||12E1FdK^J_` #6TXT_T_gkgqgqrJ||MN B= ] )7||,,V4;;**
 $q"a
	 	
 ll,,U3;;**
 $q"a
	 	
 	^\eL]KEJ<<LL23>BUAVX r@   r2   rB   c                     [        U 5      n [        U S5      (       a  U R                  $ [        U S5      (       a  U R                  $ [	        S5      e)z)Auto-detect feature dimension from model.head_hidden_sizenum_featureszCannot auto-detect feature dimension. Model must have 'head_hidden_size' or 'num_features' attribute, or you must specify student_feature_dim and teacher_feature_dim explicitly.)r	   r3   r   r   rH   )r2   s    r>   r   +FeatureDistillationTask._detect_feature_dim\  sT     U#5,--)))UN++%%%R r@   r   c                     SSK Jn  U R                  R                  5        H
  nSUl        M     U" U R
                  4SU0UD6U l        U $ )af  Prepare task for distributed training.

Wraps the trainable module (student + projection) in DistributedDataParallel
(DDP) while leaving the frozen teacher model unwrapped.

Args:
    device_ids: List of device IDs for DDP (e.g., [local_rank])
    **ddp_kwargs: Additional arguments passed to DistributedDataParallel

Returns:
    self (for method chaining)
r   r   Fr   r   r   s        r>   r   +FeatureDistillationTask.prepare_distributedl  r   r@   rL   rM   c                     [         R                  " U R                  R                  4XS.UD6nX@l        [         R                  " U R                  4UUS.UD6U l        U R
                  R                  " SUUSSS.UD6  U$ )z9Compile feature-distillation train and eval entry points.rQ   FTr   r$   )r5   rR   r   r   r   rl   )r:   rL   rM   rS   r   s        r>   rR   FeatureDistillationTask.compile  s     ]]4#8#8#@#@o'o`no
$ %!!!
!
 	!
 	 	
!		

 	
 r@   moduleemac                    > [         TU ]  XS9nUnUbF  [        U[        5      (       a  UR                  $ [        US5      (       d   U$ UR                  nUb  MF  U$ )N)r   r   r   )r%   get_eval_modelr'   r   r   r3   r   )r:   r   r   resolvedcurrentr=   s        r>   r   &FeatureDistillationTask.get_eval_model  sg    7))A!'#EFF&7H-- nnG ! r@   c                     Ub  UOU R                  US9nUc  0 $ [        U5      nUR                  c  0 $ SUR                  R                  5       0$ )Nr   r   )get_trainable_moduler	   r   
state_dict)r:   r   r   	trainables       r>   get_task_state&FeatureDistillationTask.get_task_state  s^    
 "-43L3LQT3L3U>I (	'Ii22==?@@r@   exclude_headc                     [        U R                  5      n[        UR                  R	                  5       5      nU(       a  US S nUR
                  b)  UR                  UR
                  R	                  5       5        U$ )N)r	   r   r   r   r   r   extend)r:   r   r   r   s       r>   get_clip_parameters+FeatureDistillationTask.get_clip_parameters  sh     !6!67	)++6689
#CRJ+i22==?@r@   statestrictc                 .   U(       d  g Ub  UOU R                  US9nUc  U(       a  [        S5      e[        S5      e[        U5      nUR                  S5      nUc  g UR                  c  U(       a  [        S5      eg UR                  R                  XbS9  g )Nr   z.Cannot load EMA task state before setup_ema().z2Cannot load task state without a trainable module.r   z@Checkpoint has projection task state but task has no projection.)r   )r   RuntimeErrorr	   r4   r   load_state_dict)r:   r   r   r   r   r   projection_states          r>   load_task_state'FeatureDistillationTask.load_task_state  s     !-43L3LQT3L3U>"#STTSTT (	 99\2#'"#eff,,-=,Mr@   rA   r   c                    U R                  U5      u  p4U R                  X25      n[        R                  " 5          U R                  R                  XR                  U R                  5      nU R	                  UR                  5       SS9nSSS5        [        R                  " UW5      nU R                  U-  U R                  U-  -   n	U	UUUS.$ ! , (       d  f       NK= f)au  Forward pass with feature distillation.

Args:
    input: Input tensor [B, C, H, W]
    target: Target labels [B]

Returns:
    Dictionary containing:
        - 'loss': Combined training loss (task + distillation)
        - 'output': Student logits (for metrics)
        - 'task_loss': Classification loss component
        - 'kd_loss': Feature distillation loss component
Tr   Nr   )r   ru   r5   r   rl   r]   rY   rZ   r   r   mse_lossry   rx   )
r:   rA   r   r   r   r   r   teacher_featuresr   r   s
             r>   rW   FeatureDistillationTask.forward  s    $ ,0+@+@+G(NN>:	]]_||33E;L;LdN^N^_H#||HOO,=t|T  **-/?@**Y69Q9QT[9[[
 $"	
 	
 _s   AC
C)ru   rx   r   ry   rl   r   )	NNNNNNNNTr   r   )NFra   )TNF) r0   rb   rc   rd   re   r,   r-   r   r(   r   r   r   rf   r5   r   r   rh   r&   staticmethodr   r   r   rR   r   r   rg   r   r   r   rW   ri   rj   rk   s   @r>   r   r     s    L .25937041515-1+/ `99` !bii1D!DE`  		*	`
 &.c]` "*%` 'uo` "*#` "*#` U\\*` EKK(` ` `D 299   " *.  
#	6 &"& 3-
 
0	Xbii%8 	d 	W_`b`i`iWj 	 	 +/ARYY'A A 
c4U\\)**	+	A   *.NDd3+<&=!=>?N N RYY'	N
 N 
N0!
<<!
 LL!
 
c5<<	 	!
 !
r@   r   ) re   loggingtypingr   r   r   r   r5   torch.nnr,   torch.nn.functional
functionalr   timm.modelsr   r   
timm.utilsr	   taskr   	getLoggerr0   r)   r-   r   r(   r   r   rp   rr   r   r   r$   r@   r>   <module>r     s    ;  / /     6 #  

H
%OQ")) OQd"sBII'::;"yy" "#" &	"
 $" "JK
L K
\=0 =0@`
l `
r@   