@@ -479,6 +479,293 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
479
479
return ret;
480
480
}
481
481
482
#if defined(__cplusplus) && (__cplusplus >= 201103L)

#pragma push_macro("__INTRINSIC_LOAD")
// Defines an inline device function __FnName(ptr) that performs the PTX load
// __AsmOp from *ptr. The value is staged through __TmpType (register image)
// under constraint __AsmType and then cast back to __DeclType. __Clobber
// optionally appends a clobber list (e.g. `: "memory"`) to the asm statement.
#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType,  \
                         __Clobber)                                            \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
    __TmpType __ret;                                                           \
    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber);        \
    return (__DeclType)__ret;                                                  \
  }
#pragma push_macro("__INTRINSIC_LOAD2")
// Two-element vector variant of __INTRINSIC_LOAD: loads {x,y} with a v2 PTX
// op into a __TmpType staging vector, then converts each component to the
// declared vector's element type (taken from decltype(__ret.x)).
#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                          __Clobber)                                           \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
    __DeclType __ret;                                                          \
    __TmpType __tmp;                                                           \
    asm(__AsmOp " {%0,%1}, [%2];"                                              \
        : __AsmType(__tmp.x), __AsmType(__tmp.y)                               \
        : "l"(__ptr)__Clobber);                                                \
    using __ElementType = decltype(__ret.x);                                   \
    __ret.x = (__ElementType)(__tmp.x);                                        \
    __ret.y = (__ElementType)__tmp.y;                                          \
    return __ret;                                                              \
  }
#pragma push_macro("__INTRINSIC_LOAD4")
// Four-element vector variant of __INTRINSIC_LOAD: loads {x,y,z,w} with a v4
// PTX op and converts each component to the declared element type.
#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                          __Clobber)                                           \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
    __DeclType __ret;                                                          \
    __TmpType __tmp;                                                           \
    asm(__AsmOp " {%0,%1,%2,%3}, [%4];"                                        \
        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z),          \
          __AsmType(__tmp.w)                                                   \
        : "l"(__ptr)__Clobber);                                                \
    using __ElementType = decltype(__ret.x);                                   \
    __ret.x = (__ElementType)__tmp.x;                                          \
    __ret.y = (__ElementType)__tmp.y;                                          \
    __ret.z = (__ElementType)__tmp.z;                                          \
    __ret.w = (__ElementType)__tmp.w;                                          \
    return __ret;                                                              \
  }
// __ldcg overloads: loads with the .cg cache operator (ld.global.cg) for the
// scalar, v2, and v4 integer and floating-point types.
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
                 "=l", );

__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64", longlong2, longlong2, "=l", );

__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
                 "=h", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
                 unsigned long long, "=l", );

__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
                  "=l", );

__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
// `long` width differs by ABI (8 bytes on LP64, 4 elsewhere), so this overload
// selects the PTX load width and register constraint from sizeof(long); the
// sizeof comparison is a compile-time constant, so the dead branch folds away.
inline __device__ long __ldcg(const long *__ptr) {
  unsigned long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return (long)__ret;
}
// __ldcv overloads: loads with the .cv cache operator (ld.global.cv). Each
// asm carries a "memory" clobber so the compiler cannot fold or reorder
// repeated loads of the same address.
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
                 "=h", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
                 unsigned long long, "=l", : "memory");

__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
                 "=h", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
                 "=l", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
                  "=h", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
                  "=h", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
                  "=l", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
                  "=h", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
                  "=h", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
                  "=l", : "memory");

__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
                  "=f", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
                  "=f", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
                  "=d", : "memory");
// ABI-dependent `long` overload of __ldcv; picks the 64- or 32-bit PTX load
// from sizeof(long) at compile time (see __ldcg(const long *)).
inline __device__ long __ldcv(const long *__ptr) {
  unsigned long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return (long)__ret;
}
// __ldcs overloads: loads with the .cs (streaming) cache operator
// (ld.global.cs) for the scalar, v2, and v4 types.
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
                 "=l", );

__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );

__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
                 "=h", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
                 unsigned long long, "=l", );

__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
                  "=l", );

__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );

#pragma pop_macro("__INTRINSIC_LOAD")
#pragma pop_macro("__INTRINSIC_LOAD2")
#pragma pop_macro("__INTRINSIC_LOAD4")
// ABI-dependent `long` overload of __ldcs; picks the 64- or 32-bit PTX load
// from sizeof(long) at compile time (see __ldcg(const long *)).
inline __device__ long __ldcs(const long *__ptr) {
  unsigned long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return (long)__ret;
}
#pragma push_macro("__INTRINSIC_STORE")
// Defines an inline device function __FnName(ptr, value) that converts value
// to __TmpType and stores it to *ptr with the PTX op __AsmOp under input
// constraint __AsmType. "memory" is clobbered so the store is neither elided
// nor reordered past other memory accesses.
#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
    __TmpType __tmp = (__TmpType)__value;                                      \
    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory");       \
  }

#pragma push_macro("__INTRINSIC_STORE2")
// Two-element vector variant: converts each component to the staging element
// type (decltype(__tmp.x)) and stores {x,y} with a v2 PTX op.
#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType,           \
                           __AsmType)                                          \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
    __TmpType __tmp;                                                           \
    using __ElementType = decltype(__tmp.x);                                   \
    __tmp.x = (__ElementType)(__value.x);                                      \
    __tmp.y = (__ElementType)(__value.y);                                      \
    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x),            \
        __AsmType(__tmp.y)                                                     \
        : "memory");                                                           \
  }

#pragma push_macro("__INTRINSIC_STORE4")
// Four-element vector variant: converts each component and stores {x,y,z,w}
// with a v4 PTX op.
#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType,           \
                           __AsmType)                                          \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
    __TmpType __tmp;                                                           \
    using __ElementType = decltype(__tmp.x);                                   \
    __tmp.x = (__ElementType)(__value.x);                                      \
    __tmp.y = (__ElementType)(__value.y);                                      \
    __tmp.z = (__ElementType)(__value.z);                                      \
    __tmp.w = (__ElementType)(__value.w);                                      \
    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x),      \
        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w)             \
        : "memory");                                                           \
  }
// __stwt overloads: stores with the .wt (write-through) cache operator
// (st.global.wt) for the scalar, v2, and v4 types.
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");

__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");

__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
                  "h");
__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
                  unsigned long long, "l");

__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");

__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");

#pragma pop_macro("__INTRINSIC_STORE")
#pragma pop_macro("__INTRINSIC_STORE2")
#pragma pop_macro("__INTRINSIC_STORE4")

#endif // defined(__cplusplus) && (__cplusplus >= 201103L)
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
483
770
484
771
#if CUDA_VERSION >= 11000
0 commit comments