
    3j1                     :   S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSKJ	r
  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SSK J!r!J"r"J#r#  SSK$J%r%  SSK&J'r'  \"RP                  " \)5      r* " S S\RV                  5      r, " S S\RV                  5      r- " S S\RV                  5      r. " S S\5      r/\! " S S\5      5       r0\! " S S\05      5       r1\!" S S!9 " S" S#\0\5      5       r2\!" S$S!9 " S% S&\05      5       r3/ S'Qr4g)(zPyTorch OpenAI ImageGPT model.    N)Any)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPast)PreTrainedModel)Conv1D)auto_docstringloggingtorch_float)maybe_autocast   )ImageGPTConfigc                   x   ^  \ rS rSrS	S\\   S\4U 4S jjjrS\R                  S\R                  4S jr
SrU =r$ )
ImageGPTLayerNorm0   hidden_sizeepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g N)super__init__r   r   	ParametertorchTensorweight)selfr   r   	__class__s      h/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/imagegpt/modeling_imagegpt.pyr#   ImageGPTLayerNorm.__init__1   s,    ll5<<#<=    tensorreturnc           	          U[         R                  " [         R                  " [         R                  " U5      SSS9U R                  -   5      -  nXR
                  -  nU$ )NT)axiskeepdim)r%   sqrtmeansquarer   r'   )r(   r-   s     r*   forwardImageGPTLayerNorm.forward6   sI    %**UZZV0D2W[%\_c_g_g%ghh++%r,   )r   r'   )gh㈵>)__name__
__module____qualname____firstlineno__tupleintfloatr#   r%   r&   r6   __static_attributes____classcell__r)   s   @r*   r   r   0   s?    >E#J >U > >
ell u||  r,   r   c                     ^  \ rS rSrSS\S-  S\S-  4U 4S jjjrSS jrSS jrS r	S	 r
      SS
\R                  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\4S jjrSrU =r$ )ImageGPTAttention=   Nis_cross_attention	layer_idxc           
        > [         TU ]  5         Xl        UR                  nU R	                  S[
        R                  " [
        R                  " XD4[
        R                  S95      R                  SSXD5      SS9  UR                  U l        UR                  U l        U R                  U R                  -  U l        U R                  U l        U R                  U R                  -  U R                  :w  a&  [!        SU R                   SU R                   S35      eUR"                  U l        X l        UR&                  U l        X0l        UR*                  U l        U R$                  (       aN  [-        S	U R                  -  U R                  5      U l        [-        U R                  U R                  5      U l        O([-        S
U R                  -  U R                  5      U l        [-        U R                  U R                  5      U l        [4        R6                  " UR8                  5      U l        [4        R6                  " UR<                  5      U l        g )Nbiasdtyper   F)
persistentz=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).   r   ) r"   r#   configmax_position_embeddingsregister_bufferr%   trilonesboolviewr   	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorscale_attn_weightsrE   scale_attn_by_inverse_layer_idxrF   reorder_and_upcast_attnr   c_attnq_attnc_projr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropout)r(   rM   rE   rF   max_positionsr)   s        r*   r#   ImageGPTAttention.__init__>   s   66JJuzz="@

STYY1m  	 	
  ++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;r,   c                 F   [         R                  " XR                  SS5      5      nU R                  (       a   U[	        UR                  S5      S-  5      -  nU R                  (       a  U[        U R                  S-   5      -  nU R                  (       d  UR                  S5      UR                  S5      pvU R                  S S 2S S 2Xv-
  U2S U24   n[         R                  " UR                  5      R                  n	[         R                  " XR                  UR                  S9n	[         R                   " XU	5      nUb  XT-   n["        R$                  " SS9" U5      nUR'                  UR                  5      nU R)                  U5      n[         R                  " XS5      n
X4$ )Nr0         ?r   rJ   devicedim)r%   matmul	transposerZ   r   sizer[   r>   rF   rE   rH   finforJ   minr-   rk   wherer   Softmaxtyperb   )r(   querykeyvalueattention_maskattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs              r*   _attnImageGPTAttention._attnf   sP   ||E==R+@A""'+ejjn6K*LLL //'%0B*CCL&&',zz"~sxx|*))Aq**Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{*ML%'8Lzzb),7 $((5((6ll<7((r,   c           	      l   UR                  5       u  pVpxUR                  5       u    pn	[        R                  " XV-  Xz[        R                  UR                  S9nSnU R
                  (       a   U[        UR                  S5      5      S-  -  nU R                  (       a  U[        U R                  S-   5      -  n[        UR                  R                  SS9   UR                  SXx5      UR                  SS5      R                  SX5      p[        R                  " XR                  5       UR                  5       S	US
9nUR                  XVXz5      nS S S 5        U R                  (       d  UR                  S5      UR                  S5      nnU R                  S S 2S S 2UU-
  U2S U24   n[        R                   " UR"                  5      R$                  n[        R&                  " UUR"                  UR                  S9n[        R(                  " UUU5      nUb  X-   n[*        R,                  " SS9" U5      nUR"                  [        R                  :w  a  [/        S5      eUR                  UR"                  5      nU R1                  U5      n[        R2                  " X5      nUU4$ ! , (       d  f       GN\= f)Nrj   g      ?r0   ri   r   F)enabledrh   r   )betaalpharl   zDError with upcasting, attn_weights does not have dtype torch.float32)rp   r%   emptyfloat32rk   rZ   r>   r[   rF   r   ru   reshapero   baddbmmrE   rH   rq   rJ   rr   r-   rs   r   rt   RuntimeErrorrb   rn   )r(   rv   rw   rx   ry   bszrV   	q_seq_lendk_	k_seq_lenrz   scale_factorqkr{   r|   r}   r~   r   s                       r*   _upcast_and_reordered_attn,ImageGPTAttention._upcast_and_reordered_attn   s2   (-

%	 XXZ1 {{3?IPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L ELL--u===Y3S]]2r5J5R5RSUWY5eq ==wwy!'')RS[ghL'//	UL >
 &&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'8Lzzb),7 .eff#((5((6ll<7L((; >=s   A9J$$
J3c                 v    UR                  5       SS X#4-   nUR                  " U6 nUR                  SSSS5      $ )z:
Splits hidden_size dim into attn_head_size and num_heads
Nr0   r   rL   r   r   )rp   rS   permuter(   r-   rV   attn_head_size	new_shapes        r*   _split_headsImageGPTAttention._split_heads   sA     KKM#2&))DD	i(~~aAq))r,   c                     UR                  SSSS5      R                  5       nUR                  5       SS X#-  4-   nUR                  U5      $ )zC
Merges attn_head_size dim and num_attn_heads dim into hidden_size
r   rL   r   r   Nrh   )r   
contiguousrp   rS   r   s        r*   _merge_headsImageGPTAttention._merge_heads   sM     1a+668KKM#2&)*D)FF	{{9%%r,   hidden_states
layer_pastry   encoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentionsr.   c                    US Ln	UR                   u  pnUb]  [        U[        5      (       aF  UR                  R	                  U R
                  5      nU	(       a  UR                  nOUR                  nOUnU	(       a  UOUnU	(       Ga-  [        U S5      (       d  [        S5      eUb`  W(       aY  U R                  U5      nWR                  U R
                     R                  nUR                  U R
                     R                  nGOKU R                  U5      nU R                  U5      R                  U R                   SS9u  nnUR#                  U
SU R$                  U R&                  5      R)                  SS5      nUR#                  U
SU R$                  U R&                  5      R)                  SS5      nOU R                  U5      R                  U R                   SS9u  nnnUR#                  U
SU R$                  U R&                  5      R)                  SS5      nUR#                  U
SU R$                  U R&                  5      R)                  SS5      nUb@  WR+                  UUU R
                  5      u  nnU	(       a  SUR                  U R
                  '   UR#                  XU R$                  U R&                  5      R)                  SS5      nU R,                  (       a  U R/                  UUUU5      u  nnOU R1                  UUUU5      u  nnU R3                  UU R$                  U R&                  5      nU R5                  U5      nU R7                  U5      nUU4$ )Nr^   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`.rL   rl   r0   r   T)shape
isinstancer   
is_updatedgetrF   cross_attention_cacheself_attention_cachehasattrrY   r^   layerskeysvaluesr]   splitrX   rS   rV   rW   ro   updater\   r   r   r   r_   rd   )r(   r   r   ry   r   r   r   r   kwargsrE   r   seq_lenr   r   curr_past_key_valuescurrent_statesrv   rw   rx   r   rz   s                        r*   r6   ImageGPTAttention.forward   s    3$>'--a!*&9::'2266t~~F
%+5+K+K(+5+J+J('1$2D.-4** t 
 %*M2*11$..AFF,33DNNCJJM2![[8>>tTU>V
UhhsBFPPQRTUV

3DNNDMMJTTUVXYZ $N ; A A$//WX A YE3((3DNNDMMBLLQPQRCJJsBFPPQRTUVE!-44S%PJC!8<
%%dnn5

3GQQRSUVW''(,(G(GsTY[i(j%K(,

5#un(U%K''T^^T]]Skk+.((5L((r,   )rb   r]   r_   rM   rT   rW   rE   rF   rV   r^   r\   rd   r[   rZ   rX   )FNr!   NNNNFF)r8   r9   r:   r;   rR   r=   r#   r   r   r   r   r%   r&   r	   r<   r6   r?   r@   rA   s   @r*   rC   rC   =   s    &<4$; &<SVY]S] &< &<P )D.)`*& $(.2596:!&).B)||B) DLB) t+	B)
  %||d2B) !&t 3B) $;B)  $;B) 
B) B)r,   rC   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )ImageGPTMLPi  c                    > [         TU ]  5         UR                  n[        X5      U l        [        X15      U l        [        UR                     U l        [        R                  " UR                  5      U l        g r!   )r"   r#   r   r   c_fcr_   r   activation_functionactr   r`   rc   dropout)r(   intermediate_sizerM   rT   r)   s       r*   r#   ImageGPTMLP.__init__  sZ    &&	,8	Y:&445zz&"4"45r,   r   r.   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r!   )r   r   r_   r   )r(   r   s     r*   r6   ImageGPTMLP.forward  s@    		-0/M2]3r,   )r   r   r_   r   )
r8   r9   r:   r;   r#   r%   r&   r6   r?   r@   rA   s   @r*   r   r     s(    6U\\ ell  r,   r   c                      ^  \ rS rSrSU 4S jjr      SS\R                  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\S-  S\	4S jjr
SrU =r$ )ImageGPTBlocki  Nc                   > [         TU ]  5         UR                  nUR                  b  UR                  OSU-  n[	        X1R
                  S9U l        [        XS9U l        [	        X1R
                  S9U l	        UR                  (       a(  [        USUS9U l        [	        X1R
                  S9U l        [        XA5      U l        g )N   r   rF   T)rE   rF   )r"   r#   r   n_innerr   layer_norm_epsilonln_1rC   attnln_2add_cross_attentioncrossattentionln_cross_attnr   mlp)r(   rM   rF   r   	inner_dimr)   s        r*   r#   ImageGPTBlock.__init__  s    ((&,nn&@FNNa+o	%k7P7PQ	%fB	%k7P7PQ	%%"3Ft_h"iD!2;D]D]!^Dy1r,   r   r   ry   r   r   r   r   r.   c           	      z   Un	U R                  U5      nU R                  UUUUUS9n
U
S   nU
SS  nX-   nUbW  [        U S5      (       d  [        SU  S35      eUn	U R	                  U5      nU R                  UUUUUUS9nUS   nX-   nXSS  -   nUn	U R                  U5      nU R                  U5      nX-   nU4U-   $ )N)r   ry   r   r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   ry   r   r   r   )r   r   r   rY   r   r   r   r   )r(   r   r   ry   r   r   r   r   r   residualattn_outputsr   outputscross_attn_outputsfeed_forward_hidden_statess                  r*   r6   ImageGPTBlock.forward.  s'    !		-0yy!)/ ! 
 #1oqr"#. ,4!122 =dV DZ Z  %H ..}=M!%!4!4%-&;'="3 "5 " -Q/K$2M12 66G 		-0%)XXm%<" ='))r,   )r   r   r   r   r   r   r!   r   )r8   r9   r:   r;   r#   r%   r&   r	   rR   r<   r6   r?   r@   rA   s   @r*   r   r     s    2$ $(.2596:!&).5*||5* DL5* t+	5*
  %||d25* !&t 35* $;5*  $;5* 
5* 5*r,   r   c                   r   ^  \ rS rSr% \\S'   SrSrSrSr	S/r
\R                  " 5       U 4S j5       rS	rU =r$ )
ImageGPTPreTrainedModelif  rM   transformer	input_ids)imageTr   c           
      z  > [         TU ]  U5        [        U[        5      (       a  UR	                  5        Hm  u  p#SU;   d  M  SU;   d  M  [
        R                  " USU R                  R                  [        R                  " SU R                  R                  -  5      -  S9  Mo     g[        U[        5      (       a  UR                  R                  n[
        R                  " UR                  [         R"                  " [         R$                  " XD4[         R&                  S95      R)                  SSXD5      5        gg)	zInitialize the weights.r_   r'   g        rL   )r4   stdrI   r   N)r"   _init_weightsr   r   named_parametersinitnormal_rM   initializer_rangemathr3   n_layerrC   rN   copy_rH   r%   rP   rQ   rR   rS   )r(   modulenamepre   r)   s        r*   r   %ImageGPTPreTrainedModel._init_weightso  s     	f% fo..!224t#D(8LL$++2O2ORVR[R[\]`d`k`k`s`s\sRt2tu 5  122"MMAAMJJ

5::}&DEJJWX]]q- 3r,    )r8   r9   r:   r;   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr%   no_gradr   r?   r@   rA   s   @r*   r   r   f  sA    %!O!&*#()
]]_ r,   r   c                   ~  ^  \ rS rSrS\4U 4S jjrS rS r\            SS\	R                  S-  S\S-  S	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\S-  S\S\\-  4S jj5       rSrU =r$ )ImageGPTModeli  rM   c           
      Z  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  U R                  5      U l        [        R
                  " UR                  U R                  5      U l	        [        R                  " UR                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[!        XS9PM     sn5      U l        [%        U R                  UR&                  S9U l        SU l        U R-                  5         g s  snf )Nr   r   F)r"   r#   r   rT   r   	Embedding
vocab_sizewterN   wper`   
embd_pdropdrop
ModuleListrangenum_hidden_layersr   hr   r   ln_fgradient_checkpointing	post_init)r(   rM   ir)   s      r*   r#   ImageGPTModel.__init__  s     ++<< 1 14>>B<< > >OJJv001	ERXRjRjLklLkqf BLklm%dnn&:S:ST	&+#  ms   D(c                     U R                   $ r!   r   )r(   s    r*   get_input_embeddings"ImageGPTModel.get_input_embeddings  s    xxr,   c                     Xl         g r!   r
  )r(   new_embeddingss     r*   set_input_embeddings"ImageGPTModel.set_input_embeddings  s    !r,   Nr   past_key_valuesry   token_type_idsposition_idsinputs_embedsr   r   r   r   output_hidden_statesreturn_dictr   r.   c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  Ub  [        S5      eUbF  U R                  X5        UR                  5       nUR                  SUS   5      nUR                  S   nO1Ub#  UR                  5       SS nUR                  S   nO[        S5      eUb  UR                  OUR                  nU R                  (       a/  U R                  (       a  U	(       a  [        R                  S5        Sn	Ub  UR                  SUS   5      nU	(       a  Uc  [        U R                   S9nUcA  Ub  UR!                  5       OSn["        R$                  " US   US	9U-   nUR'                  S5      nUb  UR                  US5      nUc  U R)                  U5      nU R+                  U5      nUUR-                  UR                  5      -   n[/        U R                   S
S5      (       a  [1        U R                   UUUS9nO[3        U R                   UUS9nUb  [3        U R                   UUUS9nUb  U R)                  U5      nUU-   nU R5                  U5      nUUR                  S5      4-   nU
(       a  SOSnU
(       a  U R                   R6                  (       a  SOSnU(       a  SOSn[9        U R:                  5       H\  u  nnU(       a  UU4-   nU" UUUUUU	U
S9nUS   nU
(       d  M-  UUS   4-   nU R                   R6                  (       d  MS  UUS   4-   nM^     U R=                  U5      nUR                  " U6 nU(       a  UU4-   nU(       d  [?        S UUUUU4 5       5      $ [A        UUUUUS9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTModel
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer0   r   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rM   )rk   
is_decoder)rM   r  ry   r  )rM   r  ry   )rM   r  ry   r   r   )r   r   r   r   rL   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr!   r   ).0vs     r*   	<genexpr>(ImageGPTModel.forward.<locals>.<genexpr>=  s      wA ws   	)last_hidden_stater  r   
attentionscross_attentions)!rM   r   r  r   r  rY   %warn_if_padding_and_no_attention_maskrp   rS   r   rk   r  trainingloggerwarning_oncer
   get_seq_lengthr%   arange	unsqueezer   r   togetattrr   r   r   r   	enumerater  r  r<   r   )r(   r   r  ry   r  r  r  r   r   r   r   r  r  r   input_shape
batch_sizerk   past_seen_tokensposition_embedsr   token_type_embedsoutput_shapeall_self_attentionsall_cross_attentionsall_hidden_statesr  blockr   s                               r*   r6   ImageGPTModel.forward  s   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++BYBY ]%>cdd"66yQ#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T&&4==##p "	%+00[_EN0*$++>OCRC^==?de <<BGJZZL'11!4L%+00R@N  HHY/M((<0%(:(:=;O;O(PP4;;e44/{{+- /	N 7{{+-N "-%>{{+5&;	&" % $ 8),==M		-0"m&8&8&<%>>$5b4%64;;;Z;Zr`d"6BD!$&&)HAu#$58H$H!%'=#"3G $AJM  &9WQZM&I#;;222+?71:-+O(% *( 		-0%**L9   1]4D D ':KM`bvw   9+++*1
 	
r,   )r   rT   r  r  r  r   r   )NNNNNNNNNNNN)r8   r9   r:   r;   r   r#   r  r  r   r%   r&   r	   rR   r   r<   r   r6   r?   r@   rA   s   @r*   r   r     sF   ~  "  *.(,.2.2,0-1596:!%)-,0#'g
<<$&g
 g
 t+	g

 t+g
 llT)g
 ||d*g
  %||d2g
 !&t 3g
 $;g
  $;g
 #Tkg
 D[g
 g
 
:	:g
 g
r,   r   z
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            !         ^  \ rS rSrSS0rS\4U 4S jjr\             SS\R                  S-  S\
S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S\S-  S\S\\-  4S jj5       rSrU =r$ )ImageGPTForCausalImageModelingiL  zlm_head.weightztransformer.wte.weightrM   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  S-
  SS9U l        U R                  5         g )Nr   FrH   )
r"   r#   r   r   r   Linearn_embdr   lm_headr  r(   rM   r)   s     r*   r#   'ImageGPTForCausalImageModeling.__init__U  sL     (0yy0A0AA0EER 	r,   Nr   r  ry   r  r  r  r   r   labelsr   r   r  r  r   r.   c                 (   Ub  UOU R                   R                  nU R                  UUUUUUUUU
UUUS9nUS   nU R                  U5      nSnU	br  USSS2SS24   R	                  5       nU	SSS24   R	                  5       n[        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
>>> import torch
>>> import matplotlib.pyplot as plt
>>> import numpy as np

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>> model.to(device)  # doctest: +IGNORE_RESULT

>>> # unconditional generation of 8 images
>>> batch_size = 4
>>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
>>> context = context.to(device)
>>> output = model.generate(
...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
... )

>>> clusters = image_processor.clusters
>>> height = image_processor.size["height"]
>>> width = image_processor.size["width"]

>>> samples = output[:, 1:].detach().cpu().numpy()
>>> samples_img = [
...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
... ]  # convert color cluster tokens back to pixels
>>> f, axes = plt.subplots(1, batch_size, dpi=300)

>>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
...     ax.axis("off")
...     ax.imshow(img)
```N)r  ry   r  r  r  r   r   r   r   r  r  r   .r0   r   )losslogitsr  r   r  r   )rM   r  r   r=  r   r   rS   rp   r   r  r   r  r   )r(   r   r  ry   r  r  r  r   r   r@  r   r   r  r  r   transformer_outputsr   	lm_logitsrB  shift_logitsshift_labelsloss_fctoutputs                          r*   r6   &ImageGPTForCausalImageModeling.forward]  sL   J &1%<k$++BYBY"..+))%'"7#9/!5# / 
 ,A.LL/	$S#2#q[1<<>L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`D\$7$;;F)-)9TGf$EvE0/??-;;*550AA
 	
r,   )r=  r   )NNNNNNNNNNNNN)r8   r9   r:   r;   _tied_weights_keysr   r#   r   r%   r&   r	   rR   r   r<   r   r6   r?   r@   rA   s   @r*   r8  r8  L  sa    +,DE~   *.(,.2.2,0-1596:&*!%)-,0#'l
<<$&l
 l
 t+	l

 t+l
 llT)l
 ||d*l
  %||d2l
 !&t 3l
 t#l
 $;l
  $;l
 #Tkl
 D[l
 l
  
2	2!l
 l
r,   r8  z
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    c                   R  ^  \ rS rSrS\4U 4S jjr\           SS\R                  S-  S\	S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\
S-  S\
S-  S\
S-  S\
S-  S\S\\-  4S jj5       rSrU =r$ )ImageGPTForImageClassificationi  rM   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g )NFr:  )
r"   r#   
num_labelsr   r   r   r;  r<  scorer  r>  s     r*   r#   'ImageGPTForImageClassification.__init__  sR      ++(0YYv}}dooEJ
 	r,   Nr   r  ry   r  r  r  r@  r   r   r  r  r   r.   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
US9
nUS   nUR                  SS9nU R	                  U5      nSnUb  U R                  UUU R                   5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
>>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
```N)	r  ry   r  r  r  r   r   r  r  r   r   rl   )rB  rC  r  r   r  )
rM   r  r   r4   rP  loss_functionr   r  r   r  )r(   r   r  ry   r  r  r  r@  r   r   r  r  r   rD  r   pooled_hidden_statesrC  rB  rI  s                      r*   r6   &ImageGPTForImageClassification.forward  s    f &1%<k$++BYBY"..+))%'/!5# / 
 ,A.,11a1801%%ffdkkBDY!4QR!88F)-)9TGf$EvE//??-;;*55
 	
r,   )rO  rP  r   )NNNNNNNNNNN)r8   r9   r:   r;   r   r#   r   r%   r&   r	   rR   r   r<   r   r6   r?   r@   rA   s   @r*   rM  rM    s%   ~   *.(,.2.2,0-1&*!%)-,0#'T
<<$&T
 T
 t+	T

 t+T
 llT)T
 ||d*T
 t#T
 $;T
  $;T
 #TkT
 D[T
 T
 
1	1T
 T
r,   rM  )r8  rM  r   r   )5__doc__r   typingr   r%   r   torch.nnr    r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   utils.genericr   configuration_imagegptr   
get_loggerr8   r#  Moduler   rC   r   r   r   r   r8  rM  __all__r   r,   r*   <module>rh     s@   %     % & ! C C ) J 9 
 . # 
 , 2 
		H	%
		 
M)		 M)`")) "E*. E*P o  D 
+ 
 
D x
%<o x
x
v _
%< _
_
Dr,   