@@ -479,6 +479,275 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
  return ret;
}
+ // Scalar load with the cache hint baked into asm_op.
+ #define INTRINSIC_LOAD(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ decl_type func_name(const decl_type *ptr) { \
+     internal_type ret; \
+     asm(asm_op" %0, [%1];" : asm_type(ret) : "l"(ptr)); \
+     return (decl_type)ret; \
+   }
+
+ #define INTRINSIC_LOAD2(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ decl_type func_name(const decl_type *ptr) { \
+     decl_type ret; \
+     internal_type tmp; \
+     asm(asm_op" {%0,%1}, [%2];" \
+         : asm_type(tmp.x), asm_type(tmp.y) \
+         : "l"(ptr)); \
+     using element_type = decltype(ret.x); \
+     ret.x = (element_type)tmp.x; \
+     ret.y = (element_type)tmp.y; \
+     return ret; \
+   }
+
+ #define INTRINSIC_LOAD4(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ decl_type func_name(const decl_type *ptr) { \
+     decl_type ret; \
+     internal_type tmp; \
+     asm(asm_op" {%0,%1,%2,%3}, [%4];" \
+         : asm_type(tmp.x), asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \
+         : "l"(ptr)); \
+     using element_type = decltype(ret.x); \
+     ret.x = (element_type)tmp.x; \
+     ret.y = (element_type)tmp.y; \
+     ret.z = (element_type)tmp.z; \
+     ret.w = (element_type)tmp.w; \
+     return ret; \
+   }
+
+ // __ldcg: loads with the .cg (cache-global, L2-only) hint.
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h");
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long, "=l");
+
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r");
+ INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r");
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h");
+ INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h");
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r");
+ INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r");
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64", longlong2, longlong2, "=l");
+
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short, "=h");
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long, unsigned long long, "=l");
+
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r");
+ INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r");
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h");
+ INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h");
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r");
+ INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r");
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2, "=l");
+
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f");
+ INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d");
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f");
+ INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f");
+ INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d");
+
+ inline __device__ long __ldcg(const long *ptr) {
+   unsigned long ret;
+   if (sizeof(long) == 8) {
+     asm("ld.global.cg.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+   } else {
+     asm("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
+   }
+   return (long)ret;
+ }
+
+ // Identical to INTRINSIC_LOAD, but with a "memory" clobber so the
+ // compiler cannot cache or reorder these loads.
+ #define MINTRINSIC_LOAD(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ decl_type func_name(const decl_type *ptr) { \
+     internal_type ret; \
+     asm(asm_op" %0, [%1];" : asm_type(ret) : "l"(ptr) : "memory"); \
+     return (decl_type)ret; \
+   }
+
+ #define MINTRINSIC_LOAD2(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ decl_type func_name(const decl_type *ptr) { \
+     decl_type ret; \
+     internal_type tmp; \
+     asm(asm_op" {%0,%1}, [%2];" \
+         : asm_type(tmp.x), asm_type(tmp.y) \
+         : "l"(ptr) : "memory"); \
+     using element_type = decltype(ret.x); \
+     ret.x = (element_type)tmp.x; \
+     ret.y = (element_type)tmp.y; \
+     return ret; \
+   }
+
+ #define MINTRINSIC_LOAD4(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ decl_type func_name(const decl_type *ptr) { \
+     decl_type ret; \
+     internal_type tmp; \
+     asm(asm_op" {%0,%1,%2,%3}, [%4];" \
+         : asm_type(tmp.x), asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \
+         : "l"(ptr) : "memory"); \
+     using element_type = decltype(ret.x); \
+     ret.x = (element_type)tmp.x; \
+     ret.y = (element_type)tmp.y; \
+     ret.z = (element_type)tmp.z; \
+     ret.w = (element_type)tmp.w; \
+     return ret; \
+   }
+
+ // __ldcv: volatile-style loads with the .cv (don't-cache) hint.
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int, "=r");
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
+                 "=h");
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int, "=r");
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
+                 unsigned long long, "=l");
+
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int, "=r");
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int, "=r");
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short, "=h");
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int, "=r");
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
+                 "=l");
+
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2, "=r");
+ MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4, "=r");
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2, "=h");
+ MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4, "=h");
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2, "=r");
+ MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4, "=r");
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2, "=l");
+
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r");
+ MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r");
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2, "=h");
+ MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4, "=h");
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r");
+ MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r");
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2, "=l");
+
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f");
+ MINTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d");
+
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2, "=f");
+ MINTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4, "=f");
+ MINTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2, "=d");
+
+ inline __device__ long __ldcv(const long *ptr) {
+   unsigned long ret;
+   if (sizeof(long) == 8) {
+     asm("ld.global.cv.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+   } else {
+     asm("ld.global.cv.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
+   }
+   return (long)ret;
+ }
+
+ // __ldcs: streaming loads with the .cs (evict-first) hint.
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r");
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h");
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long, "=l");
+
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r");
+ INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r");
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h");
+ INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h");
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r");
+ INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r");
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l");
+
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
+                "=h");
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int, "=r");
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
+                unsigned long long, "=l");
+
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r");
+ INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r");
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h");
+ INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h");
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r");
+ INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r");
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2, "=l");
+
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f");
+ INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d");
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f");
+ INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f");
+ INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d");
+
+ inline __device__ long __ldcs(const long *ptr) {
+   unsigned long ret;
+   if (sizeof(long) == 8) {
+     asm("ld.global.cs.s64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+   } else {
+     asm("ld.global.cs.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
+   }
+   return (long)ret;
+ }
+
+ // Scalar store with the cache hint baked into asm_op.
+ #define INTRINSIC_STORE(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ void func_name(decl_type *ptr, decl_type value) { \
+     internal_type tmp = (internal_type)value; \
+     asm(asm_op" [%0], %1;" ::"l"(ptr), asm_type(tmp) : "memory"); \
+   }
+
+ #define INTRINSIC_STORE2(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ void func_name(decl_type *ptr, decl_type value) { \
+     internal_type tmp; \
+     using element_type = decltype(tmp.x); \
+     tmp.x = (element_type)value.x; \
+     tmp.y = (element_type)value.y; \
+     asm(asm_op" [%0], {%1,%2};" ::"l"(ptr), asm_type(tmp.x), asm_type(tmp.y) \
+         : "memory"); \
+   }
+
+ #define INTRINSIC_STORE4(func_name, asm_op, decl_type, internal_type, asm_type) \
+   inline __device__ void func_name(decl_type *ptr, decl_type value) { \
+     internal_type tmp; \
+     using element_type = decltype(tmp.x); \
+     tmp.x = (element_type)value.x; \
+     tmp.y = (element_type)value.y; \
+     tmp.z = (element_type)value.z; \
+     tmp.w = (element_type)value.w; \
+     asm(asm_op" [%0], {%1,%2,%3,%4};" ::"l"(ptr), asm_type(tmp.x), \
+         asm_type(tmp.y), asm_type(tmp.z), asm_type(tmp.w) \
+         : "memory"); \
+   }
+
+ // __stwt: stores with the .wt (write-through) hint.
+ INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
+ INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
+ INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
+ INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
+ INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
+
+ INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
+ INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
+ INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
+ INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
+ INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
+ INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
+ INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
+
+ INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
+ INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
+                 "h");
+ INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
+ INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
+                 unsigned long long, "l");
+
+ INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u8" , uchar2 , uchar2 , "r" );
738
+ INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u8" , uchar4 , uint4 , "r" );
739
+ INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u16" , ushort2 , ushort2 , "h" );
740
+ INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u16" , ushort4 , ushort4 , "h" );
741
+ INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u32" , uint2 , uint2 , "r" );
742
+ INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u32" , uint4 , uint4 , "r" );
743
+ INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u64" , ulonglong2 , ulonglong2 , "l" );
744
+
745
+ INTRINSIC_STORE (__stwt , "st.global.wt.f32" , float , float , "f" );
746
+ INTRINSIC_STORE (__stwt , "st.global.wt.f64" , double , double , "d" );
747
+ INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.f32" , float2 , float2 , "f" );
748
+ INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.f32" , float4 , float4 , "f" );
749
+ INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.f64" , double2 , double2 , "d" );
750
+
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
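
The wrappers above expose the PTX cache hints directly: .cg caches only in L2 (bypassing L1), .cs marks data as streaming (evict-first), .cv forces a fresh fetch from memory on every load, and .wt writes the result through past the L2. A minimal usage sketch follows; the kernel and its names are hypothetical, and only __ldcs and __stwt come from the header above:

__global__ void scale_once(const float4 *in, float4 *out, float s, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Streaming load: each element is read exactly once, so mark it
    // evict-first instead of letting it pollute the cache.
    float4 v = __ldcs(&in[i]);
    v.x *= s;
    v.y *= s;
    v.z *= s;
    v.w *= s;
    // Write-through store: the result is pushed past the L2, e.g. for a
    // host thread polling mapped memory.
    __stwt(&out[i], v);
  }
}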
#if CUDA_VERSION >= 11000