//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33191640
// Cuda compilation tools, release 12.2, V12.2.140
// Based on NVVM 7.0.1
//

// PTX ISA version, minimum target architecture and pointer width for this module.
.version 8.2
.target sm_52
.address_size 64

	// .globl	decode
// Shared-memory byte array, 16-byte aligned, declared extern with no size given
// here (empty brackets). The decode kernel below uses it both as a u32 scan
// buffer and as a u16 output staging area.
.extern .shared .align 16 .b8 shared_mem[];

// ---------------------------------------------------------------------------
// decode -- table-driven variable-length-code decoder, one bit-stream slice per
// thread, with a block-wide prefix sum to find where each thread writes.
// NOTE(review): the table walk looks like a multi-level Huffman-style LUT and
// the 16-bit output looks like sign|8-bit symbol|7-bit payload (bfloat16-like);
// confirm both against the CUDA source this PTX was generated from.
//
// Parameters (roles inferred only from how each one is used below):
//   decode_param_0 (u64) : byte table base, read through the inline-asm
//                          ld.global.nc.u8 loads. Index 0..255 is the first
//                          lookup level; an entry v >= 240 redirects to the
//                          256-byte sub-table at offset (256 - v) * 256. The
//                          256-byte table at offset (param_6 - 1) * 256 holds
//                          the shift amount (code length) for each symbol.
//   decode_param_1 (u64) : encoded byte stream; the thread with global id g
//                          reads bytes [8g, 8g + 12), each read guarded by
//                          param_7.
//   decode_param_2 (u64) : one byte per output element, indexed by the output
//                          element index; bit 7 and bits 6..0 are merged into
//                          the 16-bit output.
//   decode_param_3 (u64) : u32 array indexed by %ctaid.x and %ctaid.x + 1
//                          (first output index of this block / of the next).
//   decode_param_4 (u64) : packed 5-bit fields, MSB first; field g is the
//                          bit offset at which thread g starts decoding.
//   decode_param_5 (u64) : u16 output array.
//   decode_param_6 (u32) : table count; only used as (param_6 << 8) - 256.
//   decode_param_7 (u32) : byte length bound for param_1 reads (signed compare).
//   decode_param_8 (u32) : upper bound on output element indices.
//
// Shared memory layout (byte offsets from shared_mem, N = %ntid.x):
//   [0, 4N)     u32 per-thread symbol counts, scanned in place
//   [4N, 4N+4)  u32 first output index of the next block
//   [4N+4, ...) u16 staging area for this block's decoded outputs
//
// Every thread reaches every bar.sync; the only early exit is after the last one.
// NOTE(review): the tree scan below appears to assume %ntid.x is a power of
// two -- confirm against the launch configuration.
// ---------------------------------------------------------------------------
.visible .entry decode(
	.param .u64 decode_param_0,
	.param .u64 decode_param_1,
	.param .u64 decode_param_2,
	.param .u64 decode_param_3,
	.param .u64 decode_param_4,
	.param .u64 decode_param_5,
	.param .u32 decode_param_6,
	.param .u32 decode_param_7,
	.param .u32 decode_param_8
)
{
	.reg .pred 	%p<56>;
	.reg .b16 	%rs<112>;
	.reg .b32 	%r<359>;
	.reg .b64 	%rd<226>;


	// Load parameters. param_0 (%rd44) is used as-is by the inline-asm loads;
	// params 1..5 are converted to global-space addresses.
	ld.param.u64 	%rd44, [decode_param_0];
	ld.param.u64 	%rd48, [decode_param_1];
	ld.param.u64 	%rd49, [decode_param_2];
	ld.param.u64 	%rd50, [decode_param_3];
	ld.param.u64 	%rd45, [decode_param_4];
	ld.param.u64 	%rd46, [decode_param_5];
	ld.param.u32 	%r79, [decode_param_6];
	ld.param.u32 	%r80, [decode_param_7];
	ld.param.u32 	%r81, [decode_param_8];
	cvta.to.global.u64 	%rd1, %rd49;
	cvta.to.global.u64 	%rd2, %rd50;
	cvta.to.global.u64 	%rd3, %rd48;
	// %r1 = block size N, %r2 = 4N, %r3 = block id, %r358 = thread id,
	// %r5 = global thread id g, %r6 = 8g = first stream byte of this thread.
	mov.u32 	%r1, %ntid.x;
	shl.b32 	%r2, %r1, 2;
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r358, %tid.x;
	mad.lo.s32 	%r5, %r3, %r1, %r358;
	shl.b32 	%r6, %r5, 3;
	// ---- Gather stream bytes 0..7 into big-endian positions of a 64-bit word.
	// Each byte k is loaded only if (8g + k) < param_7; otherwise its
	// contribution stays 0. Byte k ends up at bits [63-8k, 56-8k].
	setp.ge.s32 	%p1, %r6, %r80;
	mov.u64 	%rd212, 0;
	mov.u64 	%rd211, %rd212;
	@%p1 bra 	$L__BB0_2;

	// byte 0 -> bits 63..56
	cvt.s64.s32 	%rd51, %r6;
	add.s64 	%rd52, %rd3, %rd51;
	ld.global.nc.u8 	%rs29, [%rd52];
	cvt.u64.u16 	%rd53, %rs29;
	shl.b64 	%rd211, %rd53, 56;

$L__BB0_2:
	// %rd6 = &param_1[8g], computed unconditionally and reused for bytes 1..11.
	add.s32 	%r82, %r6, 1;
	setp.ge.s32 	%p2, %r82, %r80;
	cvt.s64.s32 	%rd55, %r6;
	add.s64 	%rd6, %rd3, %rd55;
	@%p2 bra 	$L__BB0_4;

	// byte 1 -> bits 55..48 (mask 0x00FF000000000000)
	ld.global.nc.u8 	%rs30, [%rd6+1];
	cvt.u64.u16 	%rd56, %rs30;
	shl.b64 	%rd57, %rd56, 48;
	and.b64  	%rd212, %rd57, 71776119061217280;

$L__BB0_4:
	add.s32 	%r83, %r6, 2;
	setp.ge.s32 	%p3, %r83, %r80;
	mov.u64 	%rd214, 0;
	mov.u64 	%rd213, %rd214;
	@%p3 bra 	$L__BB0_6;

	// byte 2 -> bits 47..40 (mask 0x0000FF0000000000)
	ld.global.nc.u8 	%rs31, [%rd6+2];
	cvt.u64.u16 	%rd59, %rs31;
	shl.b64 	%rd60, %rd59, 40;
	and.b64  	%rd213, %rd60, 280375465082880;

$L__BB0_6:
	add.s32 	%r84, %r6, 3;
	setp.ge.s32 	%p4, %r84, %r80;
	@%p4 bra 	$L__BB0_8;

	// byte 3 -> bits 39..32 (mask 0x000000FF00000000)
	ld.global.nc.u8 	%rs32, [%rd6+3];
	cvt.u64.u16 	%rd62, %rs32;
	shl.b64 	%rd63, %rd62, 32;
	and.b64  	%rd214, %rd63, 1095216660480;

$L__BB0_8:
	add.s32 	%r85, %r6, 4;
	setp.ge.s32 	%p5, %r85, %r80;
	mov.u64 	%rd216, 0;
	mov.u64 	%rd215, %rd216;
	@%p5 bra 	$L__BB0_10;

	// byte 4 -> bits 31..24 (multiply by 2^24)
	ld.global.nc.u8 	%rs33, [%rd6+4];
	cvt.u32.u16 	%r86, %rs33;
	and.b32  	%r87, %r86, 255;
	mul.wide.u32 	%rd215, %r87, 16777216;

$L__BB0_10:
	add.s32 	%r88, %r6, 5;
	setp.ge.s32 	%p6, %r88, %r80;
	@%p6 bra 	$L__BB0_12;

	// byte 5 -> bits 23..16 (multiply by 2^16)
	ld.global.nc.u8 	%rs34, [%rd6+5];
	cvt.u32.u16 	%r89, %rs34;
	and.b32  	%r90, %r89, 255;
	mul.wide.u32 	%rd216, %r90, 65536;

$L__BB0_12:
	add.s32 	%r91, %r6, 6;
	setp.ge.s32 	%p7, %r91, %r80;
	mov.u64 	%rd218, 0;
	mov.u64 	%rd217, %rd218;
	@%p7 bra 	$L__BB0_14;

	// byte 6 -> bits 15..8 (multiply by 2^8)
	ld.global.nc.u8 	%rs35, [%rd6+6];
	cvt.u32.u16 	%r92, %rs35;
	and.b32  	%r93, %r92, 255;
	mul.wide.u32 	%rd217, %r93, 256;

$L__BB0_14:
	add.s32 	%r94, %r6, 7;
	setp.ge.s32 	%p8, %r94, %r80;
	@%p8 bra 	$L__BB0_16;

	// byte 7 -> bits 7..0
	ld.global.nc.u8 	%rs36, [%rd6+7];
	cvt.u64.u16 	%rd68, %rs36;
	and.b64  	%rd218, %rd68, 255;

$L__BB0_16:
	// ---- Gather stream bytes 8..11 into a big-endian 32-bit refill word,
	// with the same per-byte bound check against param_7.
	add.s32 	%r96, %r6, 8;
	setp.ge.s32 	%p9, %r96, %r80;
	mov.u32 	%r340, 0;
	mov.u32 	%r339, %r340;
	@%p9 bra 	$L__BB0_18;

	// byte 8 -> bits 31..24
	ld.global.nc.u8 	%rs37, [%rd6+8];
	cvt.u32.u16 	%r97, %rs37;
	shl.b32 	%r339, %r97, 24;

$L__BB0_18:
	add.s32 	%r99, %r6, 9;
	setp.ge.s32 	%p10, %r99, %r80;
	@%p10 bra 	$L__BB0_20;

	// byte 9 -> bits 23..16 (mask 0x00FF0000)
	ld.global.nc.u8 	%rs38, [%rd6+9];
	cvt.u32.u16 	%r100, %rs38;
	shl.b32 	%r101, %r100, 16;
	and.b32  	%r340, %r101, 16711680;

$L__BB0_20:
	add.s32 	%r103, %r6, 10;
	setp.ge.s32 	%p11, %r103, %r80;
	mov.u32 	%r342, 0;
	mov.u32 	%r341, %r342;
	@%p11 bra 	$L__BB0_22;

	// byte 10 -> bits 15..8
	ld.global.nc.u8 	%rs39, [%rd6+10];
	mul.wide.u16 	%r341, %rs39, 256;

$L__BB0_22:
	add.s32 	%r105, %r6, 11;
	setp.ge.s32 	%p12, %r105, %r80;
	@%p12 bra 	$L__BB0_24;

	// byte 11 -> bits 7..0
	ld.global.nc.u8 	%rs41, [%rd6+11];
	cvt.u32.u16 	%r106, %rs41;
	and.b32  	%r342, %r106, 255;

$L__BB0_24:
	// %r108 = shared_mem base, %r17 = shared_mem + 4N (the extra u32 slot),
	// %r15 = shared_mem + 4N + 4 (start of the u16 staging area).
	mov.u32 	%r108, shared_mem;
	add.s32 	%r17, %r108, %r2;
	add.s32 	%r15, %r17, 4;
	bar.sync 	0;
	// ---- Fetch this thread's 5-bit start offset from param_4.
	// bit position = 5g; %r113 = bitpos / 8 (signed divide, rounded toward 0).
	mul.lo.s32 	%r109, %r5, 5;
	shr.s32 	%r110, %r109, 31;
	shr.u32 	%r111, %r110, 29;
	add.s32 	%r112, %r109, %r111;
	shr.s32 	%r113, %r112, 3;
	cvt.s64.s32 	%rd69, %r113;
	cvta.to.global.u64 	%rd70, %rd45;
	add.s64 	%rd71, %rd70, %rd69;
	// Two adjacent bytes are always read (no bound check on this pair).
	ld.global.nc.u8 	%rs42, [%rd71+1];
	cvt.u32.u16 	%r114, %rs42;
	and.b32  	%r115, %r114, 255;
	ld.global.nc.u8 	%rs43, [%rd71];
	// %r347 = number of symbols counted by this thread (first pass).
	mov.u32 	%r347, 0;
	cvt.u32.u16 	%r116, %rs43;
	// prmt selector 0x7604: %r117 = (byte[idx] << 8) | byte[idx + 1].
	prmt.b32 	%r117, %r116, %r115, 30212;
	// Shift right by 11 - (bitpos % 8) so the 5-bit field lands in bits 4..0.
	and.b32  	%r118, %r112, -8;
	sub.s32 	%r119, %r118, %r109;
	add.s32 	%r120, %r119, 11;
	shr.u32 	%r121, %r117, %r120;
	cvt.u64.u32 	%rd72, %r121;
	cvt.u16.u32 	%rs44, %r121;
	// %rs107 = start bit offset (0..31); kept for the second (writing) pass.
	and.b16  	%rs107, %rs44, 31;
	// OR the eight byte lanes into the 64-bit window (the masks are redundant
	// re-masks of lanes that are already confined to their byte positions).
	and.b64  	%rd73, %rd214, 1095216660480;
	or.b64  	%rd74, %rd73, %rd213;
	or.b64  	%rd75, %rd74, %rd215;
	or.b64  	%rd76, %rd75, %rd216;
	and.b64  	%rd77, %rd76, 281474976645120;
	or.b64  	%rd78, %rd77, %rd217;
	or.b64  	%rd79, %rd78, %rd218;
	// %rd21 = top two bytes; reused when the window is rebuilt for pass two.
	or.b64  	%rd21, %rd212, %rd211;
	or.b64  	%rd80, %rd21, %rd79;
	and.b64  	%rd222, %rd72, 31;
	and.b32  	%r122, %r121, 31;
	// %rd219 = window with the start offset shifted out (next code at the MSB).
	shl.b64 	%rd219, %rd80, %r122;
	// %r16 = (param_6 - 1) * 256 = offset of the code-length table in param_0.
	shl.b32 	%r123, %r79, 8;
	add.s32 	%r16, %r123, -256;
	cvta.to.global.u64 	%rd24, %rd46;
	// %rs105 = running bit counter for the counting pass.
	mov.u16 	%rs105, %rs107;

	// ================= Pass 1a: count symbols in the first 32 bits ==========
	// Loop while the low byte of the bit counter is < 32.
$L__BB0_25:
	// Level 0 lookup: index = top byte of the window.
	shr.u64 	%rd82, %rd219, 56;
	add.s64 	%rd81, %rd44, %rd82;
	// begin inline asm
	ld.global.nc.u8 %r344, [%rd81];
	// end inline asm
	cvt.u16.u32 	%rs45, %r344;
	and.b16  	%rs46, %rs45, 255;
	// Entries below 240 are final symbols; 240..255 select a sub-table.
	setp.lt.u16 	%p13, %rs46, 240;
	@%p13 bra 	$L__BB0_29;

	// Level 1: index = (256 - v) * 256 + window bits 55..48.
	shl.b32 	%r126, %r344, 8;
	and.b32  	%r127, %r126, 65280;
	mov.u32 	%r128, 65536;
	sub.s32 	%r129, %r128, %r127;
	cvt.u64.u32 	%rd84, %r129;
	bfe.u64 	%rd85, %rd219, 48, 8;
	or.b64  	%rd86, %rd85, %rd84;
	add.s64 	%rd83, %rd44, %rd86;
	// begin inline asm
	ld.global.nc.u8 %r344, [%rd83];
	// end inline asm
	cvt.u16.u32 	%rs47, %r344;
	and.b16  	%rs48, %rs47, 255;
	setp.lt.u16 	%p14, %rs48, 240;
	@%p14 bra 	$L__BB0_29;

	// Level 2: same redirect, keyed by window bits 47..40.
	shl.b32 	%r131, %r344, 8;
	and.b32  	%r132, %r131, 65280;
	sub.s32 	%r134, %r128, %r132;
	cvt.u64.u32 	%rd88, %r134;
	bfe.u64 	%rd89, %rd219, 40, 8;
	or.b64  	%rd90, %rd89, %rd88;
	add.s64 	%rd87, %rd44, %rd90;
	// begin inline asm
	ld.global.nc.u8 %r344, [%rd87];
	// end inline asm
	cvt.u16.u32 	%rs49, %r344;
	and.b16  	%rs50, %rs49, 255;
	setp.lt.u16 	%p15, %rs50, 240;
	@%p15 bra 	$L__BB0_29;

	// Level 3 (last): keyed by window bits 39..32; result taken as the symbol
	// without a further >= 240 check.
	shl.b32 	%r136, %r344, 8;
	and.b32  	%r137, %r136, 65280;
	mov.u32 	%r138, 65536;
	sub.s32 	%r139, %r138, %r137;
	cvt.u64.u32 	%rd92, %r139;
	shr.u64 	%rd93, %rd219, 32;
	and.b64  	%rd94, %rd93, 255;
	or.b64  	%rd95, %rd94, %rd92;
	add.s64 	%rd91, %rd44, %rd95;
	// begin inline asm
	ld.global.nc.u8 %r344, [%rd91];
	// end inline asm

$L__BB0_29:
	// One more symbol counted; look up its length at param_0[%r16 + symbol],
	// consume that many bits from the window and advance the bit counter.
	add.s32 	%r347, %r347, 1;
	and.b32  	%r141, %r344, 255;
	add.s32 	%r142, %r16, %r141;
	cvt.s64.s32 	%rd97, %r142;
	add.s64 	%rd96, %rd44, %rd97;
	// begin inline asm
	ld.global.nc.u8 %r140, [%rd96];
	// end inline asm
	cvt.u16.u32 	%rs51, %r140;
	and.b32  	%r143, %r140, 255;
	shl.b64 	%rd219, %rd219, %r143;
	add.s16 	%rs105, %rs105, %rs51;
	and.b16  	%rs52, %rs105, 255;
	setp.lt.u16 	%p16, %rs52, 32;
	@%p16 bra 	$L__BB0_25;

	// ---- Refill: build the 32-bit word from bytes 8..11 and OR it into the
	// window at shift (counter - 32). %r25 omits byte 11; %r357 is the full word.
	or.b32  	%r144, %r340, %r339;
	or.b32  	%r25, %r144, %r341;
	or.b32  	%r357, %r25, %r342;
	cvt.u64.u32 	%rd27, %r357;
	cvt.u64.u16 	%rd98, %rs105;
	and.b64  	%rd99, %rd98, 255;
	// Adding 2^32 - 32 and truncating to 32 bits yields (counter - 32).
	add.s64 	%rd100, %rd99, 4294967264;
	cvt.u32.u64 	%r145, %rd100;
	shl.b64 	%rd101, %rd27, %r145;
	or.b64  	%rd220, %rd101, %rd219;
	// %rs106 = counter rebased by -32. The test below, 4 + ((x & 248) >> 3) > 7,
	// is true exactly when the low byte of x is >= 32, i.e. nothing left to count.
	add.s16 	%rs106, %rs105, -32;
	and.b16  	%rs53, %rs106, 248;
	shr.u16 	%rs54, %rs53, 3;
	add.s16 	%rs55, %rs54, 4;
	setp.gt.u16 	%p17, %rs55, 7;
	@%p17 bra 	$L__BB0_36;

	// ================= Pass 1b: count symbols in the next 32 bits ===========
	// Same table walk as pass 1a, operating on the refilled window %rd220.
$L__BB0_31:
	shr.u64 	%rd103, %rd220, 56;
	add.s64 	%rd102, %rd44, %rd103;
	// begin inline asm
	ld.global.nc.u8 %r346, [%rd102];
	// end inline asm
	cvt.u16.u32 	%rs56, %r346;
	and.b16  	%rs57, %rs56, 255;
	setp.lt.u16 	%p18, %rs57, 240;
	@%p18 bra 	$L__BB0_35;

	// Level 1 sub-table.
	shl.b32 	%r148, %r346, 8;
	and.b32  	%r149, %r148, 65280;
	mov.u32 	%r150, 65536;
	sub.s32 	%r151, %r150, %r149;
	cvt.u64.u32 	%rd105, %r151;
	bfe.u64 	%rd106, %rd220, 48, 8;
	or.b64  	%rd107, %rd106, %rd105;
	add.s64 	%rd104, %rd44, %rd107;
	// begin inline asm
	ld.global.nc.u8 %r346, [%rd104];
	// end inline asm
	cvt.u16.u32 	%rs58, %r346;
	and.b16  	%rs59, %rs58, 255;
	setp.lt.u16 	%p19, %rs59, 240;
	@%p19 bra 	$L__BB0_35;

	// Level 2 sub-table.
	shl.b32 	%r153, %r346, 8;
	and.b32  	%r154, %r153, 65280;
	sub.s32 	%r156, %r150, %r154;
	cvt.u64.u32 	%rd109, %r156;
	bfe.u64 	%rd110, %rd220, 40, 8;
	or.b64  	%rd111, %rd110, %rd109;
	add.s64 	%rd108, %rd44, %rd111;
	// begin inline asm
	ld.global.nc.u8 %r346, [%rd108];
	// end inline asm
	cvt.u16.u32 	%rs60, %r346;
	and.b16  	%rs61, %rs60, 255;
	setp.lt.u16 	%p20, %rs61, 240;
	@%p20 bra 	$L__BB0_35;

	// Level 3 sub-table (last).
	shl.b32 	%r158, %r346, 8;
	and.b32  	%r159, %r158, 65280;
	mov.u32 	%r160, 65536;
	sub.s32 	%r161, %r160, %r159;
	cvt.u64.u32 	%rd113, %r161;
	shr.u64 	%rd114, %rd220, 32;
	and.b64  	%rd115, %rd114, 255;
	or.b64  	%rd116, %rd115, %rd113;
	add.s64 	%rd112, %rd44, %rd116;
	// begin inline asm
	ld.global.nc.u8 %r346, [%rd112];
	// end inline asm

$L__BB0_35:
	// Count the symbol, consume its bits, and loop while the rebased counter's
	// low byte is still < 32 (same 4 + (x >> 3) < 8 form as above).
	add.s32 	%r347, %r347, 1;
	and.b32  	%r163, %r346, 255;
	add.s32 	%r164, %r16, %r163;
	cvt.s64.s32 	%rd118, %r164;
	add.s64 	%rd117, %rd44, %rd118;
	// begin inline asm
	ld.global.nc.u8 %r162, [%rd117];
	// end inline asm
	cvt.u16.u32 	%rs62, %r162;
	and.b32  	%r165, %r162, 255;
	shl.b64 	%rd220, %rd220, %r165;
	add.s16 	%rs106, %rs106, %rs62;
	and.b16  	%rs63, %rs106, 248;
	shr.u16 	%rs64, %rs63, 3;
	add.s16 	%rs65, %rs64, 4;
	setp.lt.u16 	%p21, %rs65, 8;
	@%p21 bra 	$L__BB0_31;

	// ================= Publish per-thread counts to shared memory ===========
$L__BB0_36:
	// %r35 = &shared_u32[tid]; %rd31 = &param_3[ctaid.x].
	shl.b32 	%r166, %r358, 2;
	add.s32 	%r35, %r108, %r166;
	mul.wide.u32 	%rd119, %r3, 4;
	add.s64 	%rd31, %rd2, %rd119;
	setp.eq.s32 	%p22, %r358, 0;
	@%p22 bra 	$L__BB0_38;

	// tid != 0: store the plain count.
	st.volatile.shared.u32 	[%r35], %r347;
	bra.uni 	$L__BB0_39;

$L__BB0_38:
	// tid == 0: seed slot 0 with the block's base index plus its own count,
	// so the scan result is already in absolute output indices.
	ld.global.nc.u32 	%r168, [%rd31];
	add.s32 	%r169, %r168, %r347;
	st.volatile.shared.u32 	[shared_mem], %r169;

$L__BB0_39:
	bar.sync 	0;
	// ================= Scan, up-sweep (reduce) phase =========================
	// Skipped when N < 2 (unsigned compare).
	setp.lt.u32 	%p23, %r1, 2;
	@%p23 bra 	$L__BB0_44;

	// %r348 = current stride (2, 4, 8, ... <= N).
	add.s32 	%r36, %r358, 1;
	mov.u32 	%r348, 2;

$L__BB0_41:
	// Active when (tid + 1) is a multiple of the stride.
	add.s32 	%r171, %r348, -1;
	and.b32  	%r172, %r171, %r36;
	setp.ne.s32 	%p24, %r172, 0;
	@%p24 bra 	$L__BB0_43;

	// shared[tid] += shared[tid - stride / 2]
	shr.u32 	%r173, %r348, 1;
	sub.s32 	%r174, %r358, %r173;
	shl.b32 	%r175, %r174, 2;
	add.s32 	%r177, %r108, %r175;
	ld.volatile.shared.u32 	%r178, [%r35];
	ld.volatile.shared.u32 	%r179, [%r177];
	add.s32 	%r180, %r178, %r179;
	st.volatile.shared.u32 	[%r35], %r180;

$L__BB0_43:
	bar.sync 	0;
	shl.b32 	%r348, %r348, 1;
	setp.le.u32 	%p25, %r348, %r1;
	@%p25 bra 	$L__BB0_41;

$L__BB0_44:
	// %p26 = (tid != 0); reused again at $L__BB0_51.
	setp.ne.s32 	%p26, %r358, 0;
	@%p26 bra 	$L__BB0_46;

	// tid == 0 clears the last scan slot, shared[N - 1], before the down-sweep.
	mov.u32 	%r183, 0;
	st.volatile.shared.u32 	[%r17+-4], %r183;

$L__BB0_46:
	bar.sync 	0;
	// ================= Scan, down-sweep phase ================================
	// Skipped when N < 2 (note: signed compare here, unsigned in the up-sweep).
	setp.lt.s32 	%p27, %r1, 2;
	@%p27 bra 	$L__BB0_51;

	// %r349 = current stride (N, N/2, ..., 2).
	add.s32 	%r40, %r358, 1;
	mov.u32 	%r349, %r1;

$L__BB0_48:
	add.s32 	%r184, %r349, -1;
	and.b32  	%r185, %r184, %r40;
	setp.eq.s32 	%p28, %r185, 0;
	@%p28 bra 	$L__BB0_49;
	bra.uni 	$L__BB0_50;

$L__BB0_49:
	// With j = tid - stride / 2:
	//   shared[tid] = shared[tid] + shared[j]
	//   shared[j]   = shared[tid] - shared[j]   (i.e. the old shared[tid])
	shr.u32 	%r186, %r349, 1;
	sub.s32 	%r187, %r358, %r186;
	shl.b32 	%r188, %r187, 2;
	add.s32 	%r190, %r108, %r188;
	ld.volatile.shared.u32 	%r191, [%r35];
	ld.volatile.shared.u32 	%r192, [%r190];
	add.s32 	%r193, %r191, %r192;
	st.volatile.shared.u32 	[%r35], %r193;
	ld.volatile.shared.u32 	%r194, [%r190];
	ld.volatile.shared.u32 	%r195, [%r35];
	sub.s32 	%r196, %r195, %r194;
	st.volatile.shared.u32 	[%r190], %r196;

$L__BB0_50:
	// Halve the stride; keep looping while the stride just used was > 3.
	shr.u32 	%r42, %r349, 1;
	bar.sync 	0;
	setp.gt.u32 	%p29, %r349, 3;
	mov.u32 	%r349, %r42;
	@%p29 bra 	$L__BB0_48;

$L__BB0_51:
	@%p26 bra 	$L__BB0_53;

	// tid == 0: shared[0] = param_3[ctaid] (this block's first output index),
	//           shared[N] = param_3[ctaid + 1] (the next block's first index).
	ld.global.nc.u32 	%r197, [%rd31];
	st.volatile.shared.u32 	[shared_mem], %r197;
	add.s32 	%r198, %r3, 1;
	mul.wide.u32 	%rd120, %r198, 4;
	add.s64 	%rd121, %rd2, %rd120;
	ld.global.nc.u32 	%r199, [%rd121];
	st.volatile.shared.u32 	[%r17], %r199;

$L__BB0_53:
	bar.sync 	0;
	// %r43  = block base output index,
	// %r350 = this thread's first output index (scan result),
	// %r45  = min(first + count, param_8) = one past this thread's last index.
	ld.volatile.shared.u32 	%r43, [shared_mem];
	ld.volatile.shared.u32 	%r350, [%r35];
	add.s32 	%r200, %r350, %r347;
	min.u32 	%r45, %r200, %r81;
	// Rebuild the same 64-bit big-endian window from the byte lanes and shift
	// out the start offset again (%rd222 still holds the 5-bit offset).
	or.b64  	%rd122, %rd215, %rd214;
	or.b64  	%rd123, %rd122, %rd216;
	or.b64  	%rd124, %rd123, %rd217;
	and.b64  	%rd125, %rd124, 1099511627520;
	or.b64  	%rd126, %rd125, %rd218;
	and.b64  	%rd127, %rd213, 280375465082880;
	or.b64  	%rd128, %rd21, %rd127;
	or.b64  	%rd129, %rd128, %rd126;
	cvt.u32.u64 	%r201, %rd222;
	shl.b64 	%rd221, %rd129, %r201;
	setp.ge.u32 	%p31, %r350, %r45;
	@%p31 bra 	$L__BB0_61;

	// %r351 carries the 32-bit value whose low 16 bits are replaced and stored.
	mov.u32 	%r351, %r357;

	// ================= Pass 2a: decode + emit from the first 32 bits ========
	// Loop while (bit counter low byte < 32) and (index < %r45).
$L__BB0_55:
	// Table walk as in pass 1; the symbol ends up in %rs108.
	shr.u64 	%rd131, %rd221, 56;
	add.s64 	%rd130, %rd44, %rd131;
	// begin inline asm
	ld.global.nc.u8 %r202, [%rd130];
	// end inline asm
	cvt.u16.u32 	%rs108, %r202;
	and.b16  	%rs66, %rs108, 255;
	setp.lt.u16 	%p32, %rs66, 240;
	@%p32 bra 	$L__BB0_59;

	// Level 1 sub-table.
	shl.b32 	%r204, %r202, 8;
	and.b32  	%r205, %r204, 65280;
	mov.u32 	%r206, 65536;
	sub.s32 	%r207, %r206, %r205;
	cvt.u64.u32 	%rd133, %r207;
	bfe.u64 	%rd134, %rd221, 48, 8;
	or.b64  	%rd135, %rd134, %rd133;
	add.s64 	%rd132, %rd44, %rd135;
	// begin inline asm
	ld.global.nc.u8 %r203, [%rd132];
	// end inline asm
	cvt.u16.u32 	%rs108, %r203;
	and.b16  	%rs67, %rs108, 255;
	setp.lt.u16 	%p33, %rs67, 240;
	@%p33 bra 	$L__BB0_59;

	// Level 2 sub-table.
	shl.b32 	%r209, %r203, 8;
	and.b32  	%r210, %r209, 65280;
	sub.s32 	%r212, %r206, %r210;
	cvt.u64.u32 	%rd137, %r212;
	bfe.u64 	%rd138, %rd221, 40, 8;
	or.b64  	%rd139, %rd138, %rd137;
	add.s64 	%rd136, %rd44, %rd139;
	// begin inline asm
	ld.global.nc.u8 %r208, [%rd136];
	// end inline asm
	cvt.u16.u32 	%rs108, %r208;
	and.b16  	%rs68, %rs108, 255;
	setp.lt.u16 	%p34, %rs68, 240;
	@%p34 bra 	$L__BB0_59;

	// Level 3 sub-table (last).
	shl.b32 	%r214, %r208, 8;
	and.b32  	%r215, %r214, 65280;
	mov.u32 	%r216, 65536;
	sub.s32 	%r217, %r216, %r215;
	cvt.u64.u32 	%rd141, %r217;
	shr.u64 	%rd142, %rd221, 32;
	and.b64  	%rd143, %rd142, 255;
	or.b64  	%rd144, %rd143, %rd141;
	add.s64 	%rd140, %rd44, %rd144;
	// begin inline asm
	ld.global.nc.u8 %r213, [%rd140];
	// end inline asm
	cvt.u16.u32 	%rs108, %r213;

$L__BB0_59:
	// Pack the 16-bit output from symbol s and side byte m = param_2[index]:
	//   high byte = (m & 0x80) | (s >> 1)
	//   low byte  = ((s & 1) << 7) | (m & 0x7F)
	// i.e. bit 15 = m bit 7, bits 14..7 = s, bits 6..0 = m bits 6..0.
	cvt.u64.u32 	%rd146, %r350;
	add.s64 	%rd147, %rd1, %rd146;
	ld.global.nc.u8 	%rs69, [%rd147];
	and.b16  	%rs70, %rs69, 128;
	and.b16  	%rs71, %rs108, 254;
	shr.u16 	%rs72, %rs71, 1;
	or.b16  	%rs73, %rs70, %rs72;
	mul.wide.u16 	%r219, %rs73, 256;
	and.b16  	%rs74, %rs69, 127;
	cvt.u32.u16 	%r220, %rs74;
	cvt.u32.u16 	%r221, %rs108;
	// bfi inserts the symbol at bit 7 above the 7 payload bits; the mask then
	// keeps only the low byte.
	bfi.b32 	%r222, %r221, %r220, 7, 9;
	and.b32  	%r223, %r222, 255;
	and.b32  	%r224, %r351, -65536;
	or.b32  	%r225, %r224, %r223;
	or.b32  	%r351, %r225, %r219;
	// Stage at shared_mem + 4N + 4 + 2 * (index - block base); only the low
	// 16 bits of %r351 are stored.
	sub.s32 	%r226, %r350, %r43;
	shl.b32 	%r227, %r226, 1;
	add.s32 	%r228, %r15, %r227;
	st.volatile.shared.u16 	[%r228], %r351;
	// Consume the code's bits and advance the counter and the output index.
	and.b32  	%r229, %r221, 255;
	add.s32 	%r230, %r16, %r229;
	cvt.s64.s32 	%rd148, %r230;
	add.s64 	%rd145, %rd44, %rd148;
	// begin inline asm
	ld.global.nc.u8 %r218, [%rd145];
	// end inline asm
	cvt.u16.u32 	%rs75, %r218;
	and.b32  	%r231, %r218, 255;
	shl.b64 	%rd221, %rd221, %r231;
	add.s16 	%rs107, %rs107, %rs75;
	and.b16  	%rs76, %rs107, 255;
	setp.lt.u16 	%p35, %rs76, 32;
	add.s32 	%r350, %r350, 1;
	setp.lt.u32 	%p36, %r350, %r45;
	and.pred  	%p37, %p35, %p36;
	@%p37 bra 	$L__BB0_55;

	// Remember the final bit counter for the refill shift below.
	cvt.u64.u16 	%rd149, %rs107;
	and.b64  	%rd222, %rd149, 255;

$L__BB0_61:
	// Refill the window with the 32-bit word at shift (counter - 32), as in
	// pass 1. If no indices remain, go straight to the final barrier.
	add.s64 	%rd150, %rd222, 4294967264;
	cvt.u32.u64 	%r232, %rd150;
	shl.b64 	%rd151, %rd27, %r232;
	or.b64  	%rd225, %rd151, %rd221;
	setp.le.u32 	%p38, %r45, %r350;
	@%p38 bra 	$L__BB0_79;

	// ================= Pass 2b: emit the remaining symbols ==================
	// The remaining count (%r45 - %r350) is handled as one peeled iteration
	// when it is odd, followed by a loop unrolled by two. No bit-counter test
	// is made here; only the index bound terminates the loop.
	sub.s32 	%r233, %r45, %r350;
	and.b32  	%r234, %r233, 1;
	setp.eq.b32 	%p39, %r234, 1;
	mov.pred 	%p40, 0;
	xor.pred  	%p41, %p39, %p40;
	not.pred 	%p42, %p41;
	mov.u32 	%r356, %r350;
	// Even remaining count: skip the peeled iteration.
	@%p42 bra 	$L__BB0_68;

	// ---- Peeled iteration (odd remaining count): table walk into %rs109.
	shr.u64 	%rd153, %rd225, 56;
	add.s64 	%rd152, %rd44, %rd153;
	// begin inline asm
	ld.global.nc.u8 %r235, [%rd152];
	// end inline asm
	cvt.u16.u32 	%rs109, %r235;
	and.b16  	%rs77, %rs109, 255;
	setp.lt.u16 	%p43, %rs77, 240;
	@%p43 bra 	$L__BB0_67;

	// Level 1 sub-table.
	shl.b32 	%r237, %r235, 8;
	and.b32  	%r238, %r237, 65280;
	mov.u32 	%r239, 65536;
	sub.s32 	%r240, %r239, %r238;
	cvt.u64.u32 	%rd155, %r240;
	bfe.u64 	%rd156, %rd225, 48, 8;
	or.b64  	%rd157, %rd156, %rd155;
	add.s64 	%rd154, %rd44, %rd157;
	// begin inline asm
	ld.global.nc.u8 %r236, [%rd154];
	// end inline asm
	cvt.u16.u32 	%rs109, %r236;
	and.b16  	%rs78, %rs109, 255;
	setp.lt.u16 	%p44, %rs78, 240;
	@%p44 bra 	$L__BB0_67;

	// Level 2 sub-table.
	shl.b32 	%r242, %r236, 8;
	and.b32  	%r243, %r242, 65280;
	sub.s32 	%r245, %r239, %r243;
	cvt.u64.u32 	%rd159, %r245;
	bfe.u64 	%rd160, %rd225, 40, 8;
	or.b64  	%rd161, %rd160, %rd159;
	add.s64 	%rd158, %rd44, %rd161;
	// begin inline asm
	ld.global.nc.u8 %r241, [%rd158];
	// end inline asm
	cvt.u16.u32 	%rs109, %r241;
	and.b16  	%rs79, %rs109, 255;
	setp.lt.u16 	%p45, %rs79, 240;
	@%p45 bra 	$L__BB0_67;

	// Level 3 sub-table (last).
	shl.b32 	%r247, %r241, 8;
	and.b32  	%r248, %r247, 65280;
	mov.u32 	%r249, 65536;
	sub.s32 	%r250, %r249, %r248;
	cvt.u64.u32 	%rd163, %r250;
	shr.u64 	%rd164, %rd225, 32;
	and.b64  	%rd165, %rd164, 255;
	or.b64  	%rd166, %rd165, %rd163;
	add.s64 	%rd162, %rd44, %rd166;
	// begin inline asm
	ld.global.nc.u8 %r246, [%rd162];
	// end inline asm
	cvt.u16.u32 	%rs109, %r246;

$L__BB0_67:
	// Pack and stage one output (same bit layout as in pass 2a), then consume
	// the code's bits. %r356 = index of the next element to emit.
	cvt.u64.u32 	%rd168, %r350;
	add.s64 	%rd169, %rd1, %rd168;
	ld.global.nc.u8 	%rs80, [%rd169];
	and.b16  	%rs81, %rs80, 128;
	and.b16  	%rs82, %rs109, 254;
	shr.u16 	%rs83, %rs82, 1;
	or.b16  	%rs84, %rs81, %rs83;
	mul.wide.u16 	%r252, %rs84, 256;
	and.b16  	%rs85, %rs80, 127;
	cvt.u32.u16 	%r253, %rs85;
	cvt.u32.u16 	%r254, %rs109;
	bfi.b32 	%r255, %r254, %r253, 7, 9;
	and.b32  	%r256, %r255, 255;
	and.b32  	%r257, %r25, -65536;
	or.b32  	%r258, %r257, %r256;
	or.b32  	%r357, %r258, %r252;
	sub.s32 	%r259, %r350, %r43;
	shl.b32 	%r260, %r259, 1;
	add.s32 	%r261, %r15, %r260;
	st.volatile.shared.u16 	[%r261], %r357;
	add.s32 	%r356, %r350, 1;
	and.b32  	%r262, %r254, 255;
	add.s32 	%r263, %r16, %r262;
	cvt.s64.s32 	%rd170, %r263;
	add.s64 	%rd167, %rd44, %rd170;
	// begin inline asm
	ld.global.nc.u8 %r251, [%rd167];
	// end inline asm
	and.b32  	%r264, %r251, 255;
	shl.b64 	%rd225, %rd225, %r264;

$L__BB0_68:
	// (-2 - %r350) == ~%r45 holds exactly when %r45 - %r350 == 1, i.e. the
	// peeled iteration was the only remaining element.
	mov.u32 	%r265, -2;
	sub.s32 	%r266, %r265, %r350;
	not.b32 	%r267, %r45;
	setp.eq.s32 	%p46, %r266, %r267;
	@%p46 bra 	$L__BB0_79;

	// %r61 = shared_mem + 4N + 2 * (%r356 - block base) + 1; the stores below
	// add +3 / +5, giving staging offsets 4N + 4 + 2 * (index - base).
	// %r355 = byte offset advanced by 4 per unrolled iteration.
	shl.b32 	%r269, %r356, 1;
	add.s32 	%r270, %r2, %r269;
	shl.b32 	%r271, %r43, 1;
	sub.s32 	%r272, %r270, %r271;
	add.s32 	%r274, %r108, %r272;
	add.s32 	%r61, %r274, 1;
	mov.u32 	%r355, 0;

	// ---- Unrolled-by-two loop. First element: table walk into %rs110.
$L__BB0_70:
	shr.u64 	%rd172, %rd225, 56;
	add.s64 	%rd171, %rd44, %rd172;
	// begin inline asm
	ld.global.nc.u8 %r275, [%rd171];
	// end inline asm
	cvt.u16.u32 	%rs110, %r275;
	and.b16  	%rs86, %rs110, 255;
	setp.lt.u16 	%p47, %rs86, 240;
	@%p47 bra 	$L__BB0_74;

	// Level 1 sub-table.
	shl.b32 	%r277, %r275, 8;
	and.b32  	%r278, %r277, 65280;
	mov.u32 	%r279, 65536;
	sub.s32 	%r280, %r279, %r278;
	cvt.u64.u32 	%rd174, %r280;
	bfe.u64 	%rd175, %rd225, 48, 8;
	or.b64  	%rd176, %rd175, %rd174;
	add.s64 	%rd173, %rd44, %rd176;
	// begin inline asm
	ld.global.nc.u8 %r276, [%rd173];
	// end inline asm
	cvt.u16.u32 	%rs110, %r276;
	and.b16  	%rs87, %rs110, 255;
	setp.lt.u16 	%p48, %rs87, 240;
	@%p48 bra 	$L__BB0_74;

	// Level 2 sub-table.
	shl.b32 	%r282, %r276, 8;
	and.b32  	%r283, %r282, 65280;
	sub.s32 	%r285, %r279, %r283;
	cvt.u64.u32 	%rd178, %r285;
	bfe.u64 	%rd179, %rd225, 40, 8;
	or.b64  	%rd180, %rd179, %rd178;
	add.s64 	%rd177, %rd44, %rd180;
	// begin inline asm
	ld.global.nc.u8 %r281, [%rd177];
	// end inline asm
	cvt.u16.u32 	%rs110, %r281;
	and.b16  	%rs88, %rs110, 255;
	setp.lt.u16 	%p49, %rs88, 240;
	@%p49 bra 	$L__BB0_74;

	// Level 3 sub-table (last).
	shl.b32 	%r287, %r281, 8;
	and.b32  	%r288, %r287, 65280;
	mov.u32 	%r289, 65536;
	sub.s32 	%r290, %r289, %r288;
	cvt.u64.u32 	%rd182, %r290;
	shr.u64 	%rd183, %rd225, 32;
	and.b64  	%rd184, %rd183, 255;
	or.b64  	%rd185, %rd184, %rd182;
	add.s64 	%rd181, %rd44, %rd185;
	// begin inline asm
	ld.global.nc.u8 %r286, [%rd181];
	// end inline asm
	cvt.u16.u32 	%rs110, %r286;

$L__BB0_74:
	// First element of the pair: pack and stage. prmt selector 0x2104 puts the
	// low-byte value in byte 0 and the high-byte value in byte 1.
	cvt.u64.u32 	%rd188, %r356;
	add.s64 	%rd189, %rd1, %rd188;
	ld.global.nc.u8 	%rs89, [%rd189];
	and.b16  	%rs90, %rs89, -128;
	and.b16  	%rs91, %rs110, 254;
	shr.u16 	%rs92, %rs91, 1;
	or.b16  	%rs93, %rs90, %rs92;
	cvt.u32.u16 	%r293, %rs93;
	and.b16  	%rs94, %rs89, 127;
	cvt.u32.u16 	%r294, %rs94;
	// %r68 = upper 16 bits carried into the second element's packed word.
	and.b32  	%r68, %r357, -65536;
	cvt.u32.u16 	%r295, %rs110;
	bfi.b32 	%r296, %r295, %r294, 7, 9;
	and.b32  	%r297, %r296, 255;
	prmt.b32 	%r298, %r293, %r297, 8452;
	add.s32 	%r299, %r61, %r355;
	st.volatile.shared.u16 	[%r299+3], %r298;
	// Consume the first code; %rd42 is the window for the second element.
	and.b32  	%r300, %r295, 255;
	add.s32 	%r301, %r16, %r300;
	cvt.s64.s32 	%rd190, %r301;
	add.s64 	%rd186, %rd44, %rd190;
	// begin inline asm
	ld.global.nc.u8 %r291, [%rd186];
	// end inline asm
	and.b32  	%r302, %r291, 255;
	shl.b64 	%rd42, %rd225, %r302;
	// Second element: table walk into %rs111.
	shr.u64 	%rd191, %rd42, 56;
	add.s64 	%rd187, %rd44, %rd191;
	// begin inline asm
	ld.global.nc.u8 %r292, [%rd187];
	// end inline asm
	cvt.u16.u32 	%rs111, %r292;
	and.b16  	%rs95, %rs111, 255;
	setp.lt.u16 	%p50, %rs95, 240;
	@%p50 bra 	$L__BB0_78;

	// Level 1 sub-table.
	shl.b32 	%r304, %r292, 8;
	and.b32  	%r305, %r304, 65280;
	mov.u32 	%r306, 65536;
	sub.s32 	%r307, %r306, %r305;
	cvt.u64.u32 	%rd193, %r307;
	bfe.u64 	%rd194, %rd42, 48, 8;
	or.b64  	%rd195, %rd194, %rd193;
	add.s64 	%rd192, %rd44, %rd195;
	// begin inline asm
	ld.global.nc.u8 %r303, [%rd192];
	// end inline asm
	cvt.u16.u32 	%rs111, %r303;
	and.b16  	%rs96, %rs111, 255;
	setp.lt.u16 	%p51, %rs96, 240;
	@%p51 bra 	$L__BB0_78;

	// Level 2 sub-table.
	shl.b32 	%r309, %r303, 8;
	and.b32  	%r310, %r309, 65280;
	sub.s32 	%r312, %r306, %r310;
	cvt.u64.u32 	%rd197, %r312;
	bfe.u64 	%rd198, %rd42, 40, 8;
	or.b64  	%rd199, %rd198, %rd197;
	add.s64 	%rd196, %rd44, %rd199;
	// begin inline asm
	ld.global.nc.u8 %r308, [%rd196];
	// end inline asm
	cvt.u16.u32 	%rs111, %r308;
	and.b16  	%rs97, %rs111, 255;
	setp.lt.u16 	%p52, %rs97, 240;
	@%p52 bra 	$L__BB0_78;

	// Level 3 sub-table (last).
	shl.b32 	%r314, %r308, 8;
	and.b32  	%r315, %r314, 65280;
	mov.u32 	%r316, 65536;
	sub.s32 	%r317, %r316, %r315;
	cvt.u64.u32 	%rd201, %r317;
	shr.u64 	%rd202, %rd42, 32;
	and.b64  	%rd203, %rd202, 255;
	or.b64  	%rd204, %rd203, %rd201;
	add.s64 	%rd200, %rd44, %rd204;
	// begin inline asm
	ld.global.nc.u8 %r313, [%rd200];
	// end inline asm
	cvt.u16.u32 	%rs111, %r313;

$L__BB0_78:
	// Second element of the pair: side byte at index %r356 + 1, staged two
	// bytes after the first element ([%r299 + 5]).
	add.s32 	%r319, %r356, 1;
	cvt.u64.u32 	%rd206, %r319;
	add.s64 	%rd207, %rd1, %rd206;
	ld.global.nc.u8 	%rs98, [%rd207];
	and.b16  	%rs99, %rs98, 128;
	and.b16  	%rs100, %rs111, 254;
	shr.u16 	%rs101, %rs100, 1;
	or.b16  	%rs102, %rs99, %rs101;
	mul.wide.u16 	%r320, %rs102, 256;
	and.b16  	%rs103, %rs98, 127;
	cvt.u32.u16 	%r321, %rs103;
	cvt.u32.u16 	%r322, %rs111;
	bfi.b32 	%r323, %r322, %r321, 7, 9;
	and.b32  	%r324, %r323, 255;
	or.b32  	%r325, %r68, %r324;
	or.b32  	%r357, %r325, %r320;
	add.s32 	%r338, %r299, 3;
	st.volatile.shared.u16 	[%r338+2], %r357;
	// Consume the second code, then advance by two elements / four bytes.
	and.b32  	%r326, %r322, 255;
	add.s32 	%r327, %r16, %r326;
	cvt.s64.s32 	%rd208, %r327;
	add.s64 	%rd205, %rd44, %rd208;
	// begin inline asm
	ld.global.nc.u8 %r318, [%rd205];
	// end inline asm
	and.b32  	%r328, %r318, 255;
	shl.b64 	%rd225, %rd42, %r328;
	add.s32 	%r355, %r355, 4;
	add.s32 	%r356, %r356, 2;
	setp.lt.u32 	%p53, %r356, %r45;
	@%p53 bra 	$L__BB0_70;

	// ================= Copy staged outputs to param_5 =======================
$L__BB0_79:
	bar.sync 	0;
	// Elements to copy = min(next block base - block base, param_8 - block base),
	// compared as unsigned values.
	ld.volatile.shared.u32 	%r329, [%r17];
	sub.s32 	%r330, %r329, %r43;
	sub.s32 	%r76, %r81, %r43;
	min.u32 	%r331, %r330, %r76;
	setp.ge.u32 	%p54, %r358, %r331;
	@%p54 bra 	$L__BB0_81;

$L__BB0_80:
	// Thread copies staged element i = %r358 to param_5[block base + i], then
	// steps i by N. The limit is re-read from shared memory every iteration.
	shl.b32 	%r332, %r358, 1;
	add.s32 	%r333, %r17, %r332;
	ld.volatile.shared.u16 	%rs104, [%r333+4];
	add.s32 	%r334, %r358, %r43;
	mul.wide.u32 	%rd209, %r334, 2;
	add.s64 	%rd210, %rd24, %rd209;
	st.global.u16 	[%rd210], %rs104;
	ld.volatile.shared.u32 	%r335, [%r17];
	sub.s32 	%r336, %r335, %r43;
	min.u32 	%r337, %r336, %r76;
	add.s32 	%r358, %r358, %r1;
	setp.lt.u32 	%p55, %r358, %r337;
	@%p55 bra 	$L__BB0_80;

$L__BB0_81:
	ret;

}

