
    3j                     ^   S r SSKrSSKrSSKJr  SSKJr  SSKJr  SSK	J
r
JrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  SSKJrJrJrJ r J!r!  SSK"J#r#J$r$J%r%  \!RL                  " \'5      r( " S S\RR                  5      r* " S S\RR                  5      r+ " S S\RR                  5      r, " S S\RR                  5      r- " S S\5      r. " S S\RR                  5      r/\ " S S\5      5       r0\ " S S\05      5       r1 " S  S!\RR                  5      r2 " S" S#\RR                  5      r3 " S$ S%\RR                  5      r4 " S& S'\RR                  5      r5 " S( S)\RR                  5      r6 " S* S+\5      r7\" S,S-9 " S. S/\05      5       r8\" S0S-9 " S1 S2\0\5      5       r9/ S3Qr:g)4zPix2Struct modeling file    N)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torchdynamo_compilinglogging   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfigc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Pix2StructLayerNorm3   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zS
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      l/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr"   Pix2StructLayerNorm.__init__4   s/     	ll5::k#:; #    c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   T)keepdim)tor$   float32powmeanrsqrtr'   r&   dtypefloat16bfloat16)r(   hidden_statesvariances      r,   forwardPix2StructLayerNorm.forward<   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r.   )r'   r&   )gư>__name__
__module____qualname____firstlineno__r"   r=   __static_attributes____classcell__r+   s   @r,   r   r   3   s    $+ +r.   r   c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
Pix2StructVisionEmbeddingsL   a  
Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
is represented by a vector of `hidden_size` values.
configreturnNc                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l
        [        R                  " UR                  5      U l        g N)r!   r"   r   Linearpatch_embed_hidden_sizer)   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr(   rJ   r+   s     r,   r"   #Pix2StructVisionEmbeddings.__init__S   s}     "		&*H*H&J\J\ ]LL9K9KL!||FNNF<N<NOzz&"5"56r.   flattened_patchesc                     US S 2S S 2S4   R                  5       nUS S 2S S 2S4   R                  5       nUS S 2S S 2SS 24   nU R                  U5      nU R                  U5      nU R                  U5      nXE-   U-   nU R	                  U5      nU$ )Nr   r   r0   )longrP   rS   rT   rW   )r(   rZ   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingss          r,   r=   "Pix2StructVisionEmbeddings.forward\   s     (1a0557'1a0557-aABh7**+<=
**;7--k:  0>A
\\*-
r.   )rT   rW   rP   rS   )r@   rA   rB   rC   __doc__r   r"   r$   Tensorr=   rD   rE   rF   s   @r,   rH   rH   L   s<    7/ 7D 7 %,,  r.   rH   c                   8   ^  \ rS rSrU 4S jr   SS jrSrU =r$ )Pix2StructVisionAttentionp   c                 l  > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        U R                  U R                  -  U l	        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        SU l        g NFbias)r!   r"   r)   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutrW   	inner_dimr   rN   querykeyvalueoutputgradient_checkpointingrX   s     r,   r"   "Pix2StructVisionAttention.__init__q   s    !--"(++11//(?(??YYt//eL
99T--t~~EJYYt//eL
ii0@0@uM&+#r.   c                 @  ^ ^ UR                   SS u  mnUU 4S jnU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      n	[        R
                  " XxR                  SS5      5      n
UGc  [        R                  " ST R                  XU4U
R                  U
R                  S9nT R                  (       a  T R                  (       a  SUl        UR                  5       S:X  a)  X2SS2SSSS24   R                  UR                  5      -   nOyUb  X2R                  UR                  5      -   nOX[!        5       (       dI  [        R"                  " TU4UR                  UR                  S9nX2R                  UR                  5      -   nSU-
  nUR%                  US:H  [        R&                  " U
R                  5      R(                  5      nX-  n
[        R*                  " U
[        R,                  " [        R&                  " U
R                  5      R(                  5      5      n
[.        R0                  R3                  U
S[        R4                  S	9R7                  U
5      n[.        R0                  R9                  UT R8                  T R                  S
9n[        R
                  " X5      nUR                  SS5      R;                  5       R=                  TST R>                  5      nT RA                  U5      nU4U4-   nU(       a  X4-   nU$ )z
Self-attention block
Nr0   c                    > U R                  5       R                  TSTR                  TR                  5      R	                  SS5      $ )
projectionr1   r   r0   )
contiguousviewro   rm   	transpose)states
batch_sizer(   s    r,   to_projection_shape>Pix2StructVisionAttention.forward.<locals>.to_projection_shape   s<    $$&++JDLL$JaJabllmnpqrrr.   r   r   devicer8   Tr1   )dimr8   ptraining)!shaperr   rs   rt   r$   matmulr}   zerosro   r   r8   rv   r   requires_gradr   r3   r   r%   masked_fillfinfominmaxtensorr   
functionalsoftmaxr4   type_asrW   r{   r|   rq   ru   )r(   r;   attention_maskposition_biasoutput_attentions
seq_lengthr   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsr   s   `              @r,   r=   !Pix2StructVisionAttention.forward   s    "/!4!4Ra!8
J	s +4::m+DE )-)@A
*4::m+DE l,@,@A,FG !KKDLL*9&--W]WcWcM **t}}.2+!!#q( -q$a?O0P0S0STaThTh0i i+ -0A0A-BVBV0W W-//!&,]5I5IQ^QdQd" !.0A0A-BVBV0W W-M,88!9KU[[Y_YeYeMfMjMjk&65<<FLL0I0M0M#NO }},,V5==,QYYZ`a }},,\T\\TXTaTa,bll<> "++Aq1<<>CCJPRTXTbTbckk+..M#33/Gr.   )
rW   rv   r)   rq   rs   rm   ro   ru   rr   rt   )NNFr?   rF   s   @r,   rf   rf   p   s    ,$ G Gr.   rf   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Pix2StructVisionMlp   rJ   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g ri   r!   r"   r   rN   r)   d_ffwi_0wi_1worU   rV   rW   r   dense_act_fnactrX   s     r,   r"   Pix2StructVisionMlp.__init__       IIf00&++EJ	IIf00&++EJ	))FKK););%Hzz&"5"56&--.r.   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ rM   r   r   r   rW   
isinstancer   r&   r$   rd   r8   int8r3   r(   r;   hidden_geluhidden_linears       r,   r=   Pix2StructVisionMlp.forward       hhtyy78		-0#3]3 tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r.   r   rW   r   r   r   )	r@   rA   rB   rC   r   r"   r=   rD   rE   rF   s   @r,   r   r      s    /5 / r.   r   c                      ^  \ rS rSrS\SS4U 4S jjr  SS\R                  S\R                  S-  S\S\	\R                  \R                  4   \	\R                     -  4S	 jjr
S
rU =r$ )Pix2StructVisionLayer   rJ   rK   Nc                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr   r*   )r!   r"   chunk_size_feed_forwardseq_len_dimrf   	attentionr   mlpr   r)   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrX   s     r,   r"   Pix2StructVisionLayer.__init__   ss    '-'E'E$26:&v."5f6H6HfNcNc"d(;F<N<NTZTiTi(j%r.   r;   r   r   c                     UnU R                  U5      nU R                  UUUS9nUS   nUSS  nXd-   nU R                  U5      nU R                  U5      U-   nU4U-   nU$ )N)r   r   r   r   )r   r   r   r   )	r(   r;   r   r   residualself_attention_outputsattention_outputr   layer_outputs	            r,   r=   Pix2StructVisionLayer.forward   s     ! 55mD!%)/ "0 "

 2!4(, )3 ..}=xx-=/G+r.   )r   r   r   r   r   r   NF)r@   rA   rB   rC   r   r"   r$   rd   booltupler=   rD   rE   rF   s   @r,   r   r      s    k/ kD k /3"'	|| t+  	
 
u||U\\)	*U5<<-@	@ r.   r   c                      ^  \ rS rSrS\SS4U 4S jjr    SS\R                  S\R                  S-  S\S	\S
\S\	\
-  4S jjrSrU =r$ )Pix2StructVisionEncoderi  rJ   rK   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r   )
r!   r"   rJ   r   
ModuleListrangenum_hidden_layersr   layerrv   )r(   rJ   _r+   s      r,   r"    Pix2StructVisionEncoder.__init__  sT    ]]5QWQiQiKj#kKja$9&$AKj#kl
&+# $ls   A&r;   r   r   output_hidden_statesreturn_dictc                 $   U(       a  SOS nU(       a  SOS n[        U R                  5       H0  u  pU(       a  Xa4-   nU	" XU5      n
U
S   nU(       d  M(  XzS   4-   nM2     U(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )N r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frM   r   .0vs     r,   	<genexpr>2Pix2StructVisionEncoder.forward.<locals>.<genexpr>2  s     m$[q$[s   	last_hidden_stater;   
attentions)	enumerater   r   r   )r(   r;   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_outputss              r,   r=   Pix2StructVisionEncoder.forward  s     #7BD$5b4(4OA#$58H$H!(HYZM)!,M  &91=M<O&O#  5   14D Dm]GZ$[mmm++*
 	
r.   )rJ   rv   r   )NFFT)r@   rA   rB   rC   r   r"   r$   rd   r   r   r   r=   rD   rE   rF   s   @r,   r   r     s{    ,5 ,$ , /3"'%* 
||
 t+
  	

 #
 
 
	 
 
r.   r   c                   l    \ rS rSr% \\S'   SrSr\S 5       r	\
R                  " 5       S 5       rS rSrg	)
Pix2StructPreTrainedModeli:  rJ   )imagetextFc                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r$   r   r   r   )r(   r   
input_maskdummy_inputss       r,   r   &Pix2StructPreTrainedModel.dummy_inputsA  s6    LL.	\\*-
!*"&0

 r.   c                    U R                   R                  n[        U[        5      (       a%  [        R
                  " UR                  US-  5        g[        U[        5      (       GaN  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        R                  " UR                  R                  SX#S-  -  S9  [        UR                  S5      (       aA  UR                  R                  b*  [        R                   " UR                  R                  5        [        R                  " UR"                  R                  SX#S-  -  S9  [        UR"                  S5      (       aA  UR"                  R                  b*  [        R                   " UR"                  R                  5        [        R                  " UR$                  R                  SX$S-  -  S9  [        UR$                  S5      (       aC  UR$                  R                  b+  [        R                   " UR$                  R                  5        ggg[        U[&        5      (       Ga  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R(                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R*                  OU R                   R*                  n[        R                  " UR,                  R                  SX#U-  S-  -  S9  [        R                  " UR.                  R                  SX#S-  -  S9  [        R                  " UR0                  R                  SX#S-  -  S9  [        R                  " UR2                  R                  SX&U-  S-  -  S9  UR4                  (       a0  [        R                  " UR6                  R                  SX#S-  -  S9  gg[        U[8        R:                  5      (       a  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        R                  " UR                  SX#S-  -  S9  UR<                  bK  [?        UR                  SS5      (       d.  [        R                   " UR                  UR<                     5        ggg[        U[@        5      (       a  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        R                  " URB                  R                  SX#S-  -  S9  g[        U[8        RD                  [8        RF                  45      (       ac  [        RH                  " UR                  SU R                   RJ                  S9  UR                  b!  [        R                   " UR                  5        gg[        U[        5      (       a/  UR                  b!  [        RL                  " UR                  5        gg[        U[8        R:                  5      (       a  [        R                  " UR                  SU R                   RJ                  S9  UR<                  bK  [?        UR                  SS5      (       d.  [        R                   " UR                  UR<                     5        gggg)	zInitialize the weights      ?g        g      )r6   stdrk   N_is_hf_initializedF)'rJ   initializer_factorr   r   init	constant_r&    Pix2StructTextDenseGatedActDenser   text_configr)   r   normal_r   hasattrrk   zeros_r   r   Pix2StructTextAttentionrl   	num_headsrr   rs   rt   ru   has_relative_attention_biasrelative_attention_biasr   rQ   padding_idxgetattrPix2StructTextModellm_headrN   Conv2dtrunc_normal_initializer_rangeones_)r(   modulefactorr)   r   rm   ro   s          r,   _init_weights'Pix2StructPreTrainedModel._init_weightsL  sl    //f122NN6==&3,7 @AA dkk+;<< ''33[[,, 
 4>dkkK[3\3\4;;**//bfbmbmbrbrDLL++#6VZEZ;[\v{{F++0@0@0LFKK,,-LL++#6VZEZ;[\v{{F++0@0@0LFKK,,-LL))T>9RSvyy&))fiinn.HFIINN+ /I) 788 dkk+;<< ''33[[,,  1;4;;HX0Y0Y'',,_c_j_j_v_v 
 dkk+;<< ''11[[**  LL,,3FUgGglpFp<qrLL**&QUDU:VWLL,,3FSWFW<XYLL--CVRdHdimGm=no11V;;BBRXmq\qRrs 2-- dkk+;<< ''33[[,,  LLSfQU@U6VW!!-gfmmMach6i6iFMM&*<*<=> 7j- 344 dkk+;<< ''33[[,,  LL..SfY]H]>^_BII 677v}}3DKK<Y<YZ{{&FKK( ' 344}}(

6==) )--LLSdkk6S6ST!!-gfmmMach6i6iFMM&*<*<=> 7j- .r.   c                 :   U R                   R                  nU R                   R                  nUc  [        S5      eUR	                  UR
                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US:H  U5        U$ )Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information..r1   r   ).r   z1self.model.config.pad_token_id has to be defined.)rJ   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r(   r   r  r  shifted_input_idss        r,   _shift_right&Pix2StructPreTrainedModel._shift_right  s    !%!C!C{{//!)< 
 &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r.   r   N)r@   rA   rB   rC   r   __annotations__input_modalities_can_compile_fullgraphpropertyr   r$   no_gradr  r  rD   r   r.   r,   r   r   :  sH    ("  ]]_I? I?X!r.   r   c                      ^  \ rS rSr% \\S'   SrSrSrS/r	S\4U 4S jjr
S r\     SS\R                  S	-  S
\R                  S	-  S\S	-  S\S	-  S\S	-  S\\-  4S jj5       rSrU =r$ )Pix2StructVisionModeli  rJ   rZ   )r   Tr   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  S9U l
        U R                  5         g Nr   )r!   r"   rJ   rH   r_   r   encoderr   r)   r   	layernorm	post_initrX   s     r,   r"   Pix2StructVisionModel.__init__  sS     4V<.v6,V-?-?VEZEZ[ 	r.   c                 .    U R                   R                  $ rM   )r_   rP   r(   s    r,   get_input_embeddings*Pix2StructVisionModel.get_input_embeddings  s    ///r.   Nr   r   r   r   rK   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eUc   UR                  SS9S:g  R                  5       nU R                  U5      nU R                  UUUUUS9nUS   n	U R                  U	5      n	U(       d
  U	4n
XSS -   $ [        U	UR                  UR                  S9$ )	a\  
flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
    Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
    [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
    paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

Example:

```python
>>> import httpx
>>> from io import BytesIO
>>> from PIL import Image
>>> from transformers import AutoProcessor, Pix2StructVisionModel

>>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 2048, 768]
```
Nz%You have to specify flattened_patchesr1   r   r   )r   r   r   r   r   r   )rJ   r   r   r   r  sumfloatr_   r&  r'  r   r;   r   )r(   rZ   r   r   r   r   kwargsembedding_outputencoder_outputssequence_outputhead_outputss              r,   r=   Pix2StructVisionModel.forward  s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY$DEE!/333;q@GGIN??+<=,,)/!5# ' 
 *!,..9+-L!""555-)77&11
 	
r.   )rJ   r_   r&  r'  )NNNNN)r@   rA   rB   rC   r   r  main_input_namer  supports_gradient_checkpointing_no_split_modulesr"   r,  r   r$   rd   r   r   r   r=   rD   rE   rF   s   @r,   r#  r#    s    "")O!&*#01
5 
0  26.2)-,0#'H
 <<$.H
 t+H
  $;	H

 #TkH
 D[H
 
+	+H
 H
r.   r#  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )r   i  rJ   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g ri   r   rX   s     r,   r"   )Pix2StructTextDenseGatedActDense.__init__  r   r.   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ rM   r   r   s       r,   r=   (Pix2StructTextDenseGatedActDense.forward  r   r.   r   	r@   rA   rB   rC   r   r"   r=   rD   rE   rF   s   @r,   r   r     s    /3 / r.   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Pix2StructTextLayerFFi0  rJ   c                    > [         TU ]  5         [        U5      U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r%  )r!   r"   r   DenseReluDenser   r)   layer_norm_epsilon
layer_normr   rU   rV   rW   rX   s     r,   r"   Pix2StructTextLayerFF.__init__1  sK    >vF-f.@.@fF_F_`zz&"5"56r.   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ rM   )rF  rD  rW   )r(   r;   forwarded_statess      r,   r=   Pix2StructTextLayerFF.forward9  s;    ??=9../?@%5E(FFr.   )rD  rW   rF  r@  rF   s   @r,   rB  rB  0  s    73 7 r.   rB  c                   p   ^  \ rS rSrS
S\S\S-  4U 4S jjjr\SS j5       rSS jr	     SS jr
S	rU =r$ )r  i@  NrJ   	layer_idxc                   > [         TU ]  5         X l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l	        UR                  U l        U R                  U R                  -  U l        X0l        Uc-  [        R                  SU R                   R"                   S35        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        U R                  (       a0  [$        R0                  " U R                  U R                  5      U l        SU l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Frj   )r!   r"   r  relative_attention_num_bucketsrelative_attention_max_distancer)   rl   rm   r  ro   rV   rW   rq   rL  loggerwarning_oncer+   r@   r   rN   rr   rs   rt   ru   rQ   r  rv   r(   rJ   r  rL  r+   s       r,   r"    Pix2StructTextAttention.__init__A  sc   +F(.4.S.S+/5/U/U,!--"(++''**(?(??"*4>>+B+B*C D, , YYt//1A1AN
99T--t/?/?eLYYt//1A1AN
ii 0 0$2B2BO+++-<<8[8[]a]i]i+jD(&+#r.   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ )aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   r0   r   )r3   r$   r\   absr   
zeros_likelogr1  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r,   _relative_position_bucket1Pix2StructTextAttention._relative_position_bucket]  s   . AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r.   c                    Uc   U R                   R                  R                  n[        R                  " U[        R
                  US9SS2S4   U-   n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  USU R                  U R                  S9nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )z%Compute binned relative position biasN)r8   r   F)r\  r]  r^  )r0   r   r   r   )r  r&   r   r$   aranger\   rc  rN  rO  permute	unsqueeze)
r(   query_length
key_lengthr   past_seen_tokenscontext_positionmemory_positionr[  relative_position_bucketvaluess
             r,   compute_bias$Pix2StructTextAttention.compute_bias  s    >1188??F <<EJJvVWXZ^W^_brr,,zFSTXZ[T[\+>#'#A#A;;==	 $B $
  --.FG	*44Q7r.   c                 N   UR                   SS u  pUb  UR                  U R                  5      OSn
[        U
[        R
                  5      (       a  U
R                  5       OU
n
USLnU R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUb[  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                   nOUnU(       a  UOUnU(       aU  U(       aN  W(       aG  UR"                  U R                     R$                  nUR"                  U R                     R&                  nOU R)                  U5      nU R+                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUR                  USU R                  U R                  5      R                  SS5      nUb@  UR-                  UUU R                  5      u  nnU(       a  SUR                  U R                  '   [        R.                  " UUR                  SS5      5      nUc  UR                   S   nU R0                  (       db  [        R2                  " SU R                  U	U4UR4                  UR6                  S	9nU R8                  (       a  U R:                  (       a  SUl        OU R?                  U	UUR4                  U
S
9nUb#  USS2SS2SS2SUR                   S   24   nUU-   nUnUU-  n[@        RB                  RE                  URG                  5       SS9RI                  U5      n[@        RB                  RK                  UU RJ                  U R:                  S9n[        R.                  " UU5      nUR                  SS5      RM                  5       nUR                  USU RN                  5      nU RQ                  U5      nUU4nU(       a  UU4-   nU$ )zp
Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
Nr0   r   r1   r   Tr   r   )r   rk  r/  r   ))r   get_seq_lengthrL  r   r$   rd   r  rr   r|   ro   rm   r}   r	   
is_updatedgetcross_attention_cacheself_attention_cachelayerskeysro  rs   rt   updater   r  r   r   r8   rv   r   r   rp  r   r   r   r1  r   rW   r{   rq   ru   )r(   r;   maskkey_value_statesr   past_key_valuesr   r2  r   r   rk  is_cross_attentionr   ru  curr_past_key_valuescurrent_statesr   r   r   rj  causal_maskr   r   r   r   s                            r,   r=   Pix2StructTextAttention.forward  s    "/!4!4Ra!8
M\Mh?99$..Ino7ABRTYT`T`7a7a+113gw .T9zz-0#((RtG^G^_iijkmno &:oGZ+[+[(3377GJ!'6'L'L$'6'K'K$#2 -?)]/j-44T^^DIIJ/66t~~FMML.1J::n5L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*+?+F+FzS_aeaoao+p(
L%AEO..t~~> lJ,@,@A,FG #))"-J33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1
6==Sc !2 ! "1a,Bj.>.>r.B,B#BC - ;,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>!&&z2t~~Fkk+../Gr.   )rW   rv   r  r)   rq   rs   rm   rL  ro   ru   rr   r  rO  rN  rt   FN)T       )Nr   )NNNNF)r@   rA   rB   rC   r   intr"   staticmethodrc  rp  r=   rD   rE   rF   s   @r,   r  r  @  sZ    ,3 ,cfimcm , ,8 -  - `* [ [r.   r  c                   N   ^  \ rS rSrSS\S-  4U 4S jjjr     SS jrSrU =r$ )	 Pix2StructTextLayerSelfAttentioni   NrL  c                    > [         TU ]  5         [        XUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr  rL  r   r!   r"   r  r   r   r)   rE  rF  r   rU   rV   rW   rR  s       r,   r"   )Pix2StructTextLayerSelfAttention.__init__  sR    0W`
 .f.@.@fF_F_`zz&"5"56r.   c           	          U R                  U5      nU R                  UUUUUUS9n	XR                  U	S   5      -   nU4U	SS  -   n
U
$ )N)r|  r   r~  	use_cacher   r   r   rF  r   rW   )r(   r;   r   r   r~  r  r   r2  normed_hidden_statesr   r   s              r,   r=   (Pix2StructTextLayerSelfAttention.forward	  sl      $}=>> '+/ * 
 &5Ea5H(II "%5ab%99r.   r   rW   rF  r  )NNNFF	r@   rA   rB   rC   r  r"   r=   rD   rE   rF   s   @r,   r  r     s4    7SSWZ 7 7  r.   r  c                   L   ^  \ rS rSrSS\S-  4U 4S jjjr    SS jrSrU =r$ )	!Pix2StructTextLayerCrossAttentioni"  NrL  c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr  r   r  )r(   rJ   rL  r+   s      r,   r"   *Pix2StructTextLayerCrossAttention.__init__#  sP    0UZfop-f.@.@fF_F_`zz&"5"56r.   c           	          U R                  U5      nU R                  UUUUUUS9n	XR                  U	S   5      -   n
U
4U	SS  -   nU$ )N)r|  r}  r   r~  r   r   r   r  )r(   r;   r}  r   r   r~  r   r2  r  r   r   r   s               r,   r=   )Pix2StructTextLayerCrossAttention.forward)  sk      $}=>> -'+/ * 
 %||4DQ4G'HH/$4QR$88r.   r  rM   )NNNFr  rF   s   @r,   r  r  "  s0    7#* 7 7  r.   r  c                   V   ^  \ rS rSrSS\S-  4U 4S jjjr         SS jrSrU =r$ )	Pix2StructTextBlockiA  NrL  c                    > [         TU ]  5         [        UUUS9U l        [	        UUS9U l        [        U5      U l        g )Nr  )rL  )r!   r"   r  self_attentionr  encoder_decoder_attentionrB  r   rR  s       r,   r"   Pix2StructTextBlock.__init__B  sH    >(C
 *K*
&
 )0r.   c           	         U R                  UUUUUU	S9nUS   nUSS  nUR                  [        R                  :X  al  [        R                  " U5      R                  5       (       aC  [        R                  " UR                  5      R                  S-
  n[        R                  " X* US9nUS LnU(       a  U R                  UUUUUU	S9nUS   nUR                  [        R                  :X  al  [        R                  " U5      R                  5       (       aC  [        R                  " UR                  5      R                  S-
  n[        R                  " X* US9nUUSS  -   nU R                  U5      nUR                  [        R                  :X  al  [        R                  " U5      R                  5       (       aC  [        R                  " UR                  5      R                  S-
  n[        R                  " X* US9nU4nUU-   $ )N)r   r   r~  r  r   r   r   i  )r   r   )r}  r   r   r~  r   )r  r8   r$   r9   isinfanyr   r   clampr  r   )r(   r;   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr~  r  r   r   r2  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                     r,   r=   Pix2StructTextBlock.forwardR  s    "&!4!4)'+/ "5 "
 /q12126 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KK<[YM2$>&*&D&D!65; /"3 'E '# 4A6M ""emm3M8R8V8V8X8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O / %--/EKK4N4R4R4T4T++m&9&9:>>EK!KK<[YM "***r.   )r  r   r  r  )	NNNNNNFFTr  rF   s   @r,   r  r  A  s@    1SSWZ 1 1& "#&*<+ <+r.   r  z3
    The standalone text decoder of Pix2Struct
    )custom_introc                     ^  \ rS rSr% \\S'   SrS/rSS0rSr	U 4S jr
S	 r\           SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\S
-  S\S
-  S\S
-  S\R                  S
-  S\S
-  S\\R                  S4   \-  4S jj5       rSrU =r$ )r  i  rJ   )r   r  zlm_head.weightzembed_tokens.weightTc                 R  > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U[        US:H  5      US9PM     sn5      U l        [        UR
                  UR                  S9U l        [        R                   " UR"                  5      U l        [        R&                  " UR
                  UR                  SS9U l        U R+                  5         SU l        g s  snf )Nr   r  r   Frj   )r!   r"   r   rQ   
vocab_sizer)   embed_tokensr   r   
num_layersr  r   r   r   rE  final_layer_normrU   rV   rW   rN   r	  r(  rv   )r(   rJ   r   r+   s      r,   r"   Pix2StructTextModel.__init__  s     LL):):F<N<NO]] v0011A $FQRSV`ab1

 !4F4F4FFLeLe fzz&"5"56yy!3!3V5F5FUS 	&+#s   (!D$c                     Xl         g rM   )r  r(   new_embeddingss     r,   set_input_embeddings(Pix2StructTextModel.set_input_embeddings  s    *r.   Nr   r   r  r  inputs_embedsr~  r  r   r   labelsr   rK   .c                 0   Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUb  Ub  [        S5      eUb&  UR                  5       nUR                  SUS   5      nO"Ub  UR                  5       SS nO[        S5      eUc%  U R                  c   S5       eU R                  U5      nUu  pU(       a`  Uc]  U R                   R                  (       a/  [        [        U R                   S9[        U R                   S95      nO[        U R                   S9nUc8  Ub  UR!                  5       U-   OUn["        R$                  " UUUR&                  S	9nU R                   R(                  (       a  [+        U R                   UUUS
9nOVUSS2SSSS24   nUR-                  UR.                  S9nSU-
  ["        R0                  " UR.                  5      R2                  -  nUb  [5        U R                   UUUS9nU	(       a  SOSnU(       a  SOSnU(       a  SOSnSnSnU R7                  U5      n[9        U R:                  5       H\  u  nnU	(       a  UU4-   nU" UUUUUUUUUS9	nUS   nUS   nUb  UU(       a  SOS   nU(       d  ME  UUS   4-   nUc  MS  UUS   4-   nM^     U R=                  U5      nU R7                  U5      nU R?                  U5      nU	(       a  UU4-   nSnU
b  U
R-                  UR&                  5      n
[@        RB                  " SSS9nU" URE                  5       R                  SUR                  S5      5      U
RE                  5       R                  S5      5      nU(       d  [G        S UUUUUU4 5       5      $ [I        UUUUUUS9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
    embeddings so you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
    Training](./t5#training).

Example:

```python
>>> from transformers import AutoProcessor, Pix2StructTextModel

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

>>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> loss = outputs.loss
```
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer1   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddings)rJ   )r   )rJ   r  r   r~  )r8   r   )rJ   r  r   r  r   )r~  r  r   r   r   r   r0      r  r6   )ignore_index	reductionc              3   0   #    U  H  nUc  M  Uv   M     g 7frM   r   r   s     r,   r   .Pix2StructTextModel.forward.<locals>.<genexpr>U  s"      A  s   	)losslogitsr~  r;   r   cross_attentions)%rJ   r  r   r   r   rv   r   rP  warningr  sizer|   r  is_encoder_decoderr	   r   rt  r$   r%   r   
is_decoderr   r3   r8   r   r   r   rW   r   r   r  r	  r   CrossEntropyLossr{   r   r   )r(   r   r   r  r  r  r~  r  r   r   r  r   r2  input_shaper   r   mask_seq_lengthr  r   all_attentionsall_cross_attentionsr   r  r;   r   r   r   r  r  loss_fcts                                 r,   r=   Pix2StructTextModel.forward  s1   T "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY&&4==YNNl I ]%>stt"#..*K!r;r?;I&',,.s3Kdee $$0p2pp0 --i8M!,
0{{--"5 4l$++6V# #/dkk"B! BQA\..0:=bl  #ZZ
OML`L`aN;;!!,{{+- /	K )D$)9:K%..}/B/B.CK,M<O<O0P0T0TTK!-%>{{+5&;	&" #7BD0d&7rd(,%]3(4OA|#$58H$H!(%&- /#"3
M *!,M
 *!,M$00=CTaZ[0\-  !/=3C2E!E(4+?=QRCSBU+U(9  5< --m<]3m,   1]4D DYYv}}-F**OHF--/44RRI6K\K\K^KcKcdfKghD  #%"(   1++%1
 	
r.   )rW   r  r  rv   r   r	  )NNNNNNNNNNN)r@   rA   rB   rC   r   r  r  r:  _tied_weights_keysr9  r"   r  r   r$   
LongTensorFloatTensorr   r   r   r   r=   rD   rE   rF   s   @r,   r  r    s]    !  ./*,AB&*#,&+  .237:>;?15(,!%)-,0*.#'t
##d*t
 ))D0t
  %0047	t

 !& 1 1D 8t
 ''$.t
 t
 $;t
  $;t
 #Tkt
   4't
 D[t
 
u  #%	&)J	Jt
 t
r.   r  zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c                     ^  \ rS rSr% \\S'   SrS\4U 4S jjrS rS r	S\
R                  4S jrS	 r\            SS\R                   S
-  S\R                   S
-  S\R"                  S
-  S\R$                  S
-  S\\\R                         S
-  S\S
-  S\R"                  S
-  S\R*                  S
-  S\S
-  S\S
-  S\S
-  S\S
-  S\\R                      \-  4S jj5       rSrU =r$ )"Pix2StructForConditionalGenerationik  rJ   rZ   c                    > [         TU ]  U5        [        UR                  5      U l        [        UR                  5      U l        UR                  U l        U R                  5         g rM   )
r!   r"   r#  vision_configr&  r  r   decoderis_vqar(  rX   s     r,   r"   +Pix2StructForConditionalGeneration.__init__t  sK     ,V-A-AB*6+=+=>mm 	r.   c                 6    U R                   R                  5       $ rM   )r  r,  r+  s    r,   r,  7Pix2StructForConditionalGeneration.get_input_embeddings  s    ||0022r.   c                 :    U R                   R                  U5        g rM   )r  r  r  s     r,   r  7Pix2StructForConditionalGeneration.set_input_embeddings  s    )).9r.   rK   c                 6    U R                   R                  5       $ rM   )r  get_output_embeddingsr+  s    r,   r  8Pix2StructForConditionalGeneration.get_output_embeddings  s    ||1133r.   c                 :    U R                   R                  U5        g rM   )r  set_output_embeddingsr  s     r,   r  8Pix2StructForConditionalGeneration.set_output_embeddings  s    **>:r.   Nr   r   r   r4  r~  r  decoder_inputs_embedsr  r   r   r   c                 2   U	b  U	OU R                   R                  R                  n	Ub  UOU R                   R                  nUc  U R	                  UUU
UUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nUbX  UcU  UcR  U R                  U5      nUb  UO2UR                  U R                   R                  5      R                  5       nSUSS2S4'   U R                  UUUUUUU	U
UUUS9nU(       d  X-   $ [        UR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  UR"                  UR$                  S9	$ )	a  
flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
    Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
    `num_channels` * `patch_size` * `patch_size`

    The process of flattening the pixel patches is done by `Pix2StructProcessor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss for the decoder.

Example:

Inference:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> # autoregressive generation
>>> generated_ids = model.generate(**inputs, max_new_tokens=50)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
A stop sign is on a street corner.

>>> # conditional generation
>>> text = "A picture of"
>>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

>>> generated_ids = model.generate(**inputs, max_new_tokens=50)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
A picture of a stop sign with a red stop sign
```

Training:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
>>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> text = "A stop sign is on the street corner."

>>> inputs = processor(images=image, return_tensors="pt")
>>> labels = processor(text=text, return_tensors="pt").input_ids

>>> # forward pass
>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss
>>> print(f"{loss.item():.5f}")
5.94282
```N)rZ   r   r   r   r   r   r   r0   r   )r   r   r  r~  r  r  r  r   r   r  r   )	r  r  r~  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)rJ   r   r  r   r&  r   r   lenr  ner  r1  r  r   r  r  r~  r;   r   r  r   )r(   rZ   r   r   r   r4  r~  r  r  r  r   r   r   r2  r;   decoder_outputss                   r,   r=   *Pix2StructForConditionalGeneration.forward  s   N "+!6IDKK<S<S<]<]	%0%<k$++BYBY ""ll"3-"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 *5 '&))$++*B*BCIIK # ,-"1a4( ,,'1/+"/#1/!5# ' 
 "44 %%"))+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r.   )r  r&  r  )NNNNNNNNNNNN)r@   rA   rB   rC   r   r  r8  r"   r,  r  r   Moduler  r  r   r$   r  r  
BoolTensorr   r   rd   r   r   r=   rD   rE   rF   s   @r,   r  r  k  s    )O	/ 	3:4ryy 4;  7;3759:>BF(,*.59!%)-,0#'b
 ,,t3b
 ))D0b
 !++d2	b

 !& 0 04 7b
 uU%6%6784?b
 b
   4'b
  %||d2b
 $;b
  $;b
 #Tkb
 D[b
 
u  	!$6	6b
 b
r.   r  )r   r  r#  r  );rc   rX  r$   r    r   r   activationsr   cache_utilsr   r   r	   
generationr
   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_pix2structr   r   r   
get_loggerr@   rP  r  r   rH   rf   r   r   r   r   r#  r   rB  r  r  r  r  r  r  __all__r   r.   r,   <module>r     s       & ! C C ) J 9  .  e d 
		H	%+")) +2! !HW		 Wv")) :&6 &R&
bii &
R q! q! q!h _
5 _
 _
Fryy :BII  |bii |@ryy D		 >M+4 M+` 
R
3 R

R
j 
~
)BO ~

~
Br.   