@@ -681,6 +681,35 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
         schedule = kmp_sch_guided_iterative_chunked;
         KMP_WARNING(DispatchManyThreads);
       }
+      if (schedule == kmp_sch_runtime_simd) {
+        // compiler provides simd_width in the chunk parameter
+        schedule = team->t.t_sched.r_sched_type;
+        // Detail the schedule if needed (global controls are differentiated
+        // appropriately)
+        if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
+            schedule == __kmp_static) {
+          schedule = kmp_sch_static_balanced_chunked;
+        } else {
+          if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
+            schedule = kmp_sch_guided_simd;
+          }
+          chunk = team->t.t_sched.chunk * chunk;
+        }
+#if USE_ITT_BUILD
+        cur_chunk = chunk;
+#endif
+#ifdef KMP_DEBUG
+        {
+          const char *buff;
+          // create format specifiers before the debug output
+          buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
+                                  " chunk:%%%s\n",
+                                  traits_t<ST>::spec);
+          KD_TRACE(10, (buff, gtid, schedule, chunk));
+          __kmp_str_free(&buff);
+        }
+#endif
+      }
       pr->u.p.parm1 = chunk;
     }
     KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
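The hunk above maps the runtime schedule with the OpenMP 4.5 simd modifier onto an existing schedule kind: the compiler passes its chosen simd width as the chunk argument, and the block multiplies it with the chunk taken from the runtime schedule setting. A minimal user-side sketch of a loop that would exercise this path, assuming a standard OpenMP 4.5 toolchain (file name, values, and the OMP_SCHEDULE choice are illustrative, not part of the patch):

// Build with e.g. clang++ -fopenmp runtime_simd.cpp and run with
// OMP_SCHEDULE="guided,4" (assumed) so the runtime schedule is guided.
#include <cstdio>

int main() {
  const int n = 1024;
  double a[n];
  // schedule(simd:runtime): the compiler supplies the simd width as the
  // dispatch chunk; the runtime then scales the user's chunk by that width.
#pragma omp parallel for simd schedule(simd : runtime)
  for (int i = 0; i < n; ++i)
    a[i] = 0.5 * i;
  std::printf("a[n-1] = %f\n", a[n - 1]);
  return 0;
}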
@@ -878,7 +907,21 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
     }
     break;
   } // case
-  case kmp_sch_guided_iterative_chunked: {
+  case kmp_sch_static_balanced_chunked: {
+    // similar to balanced, but chunk adjusted to multiple of simd width
+    T nth = th->th.th_team_nproc;
+    KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
+                   " -> falling-through to static_greedy\n",
+                   gtid));
+    schedule = kmp_sch_static_greedy;
+    if (nth > 1)
+      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
+    else
+      pr->u.p.parm1 = tc;
+    break;
+  } // case
+  case kmp_sch_guided_iterative_chunked:
+  case kmp_sch_guided_simd: {
     T nproc = th->th.th_team_nproc;
     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
                    " case\n",
@@ -1140,6 +1183,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
     break;
   case kmp_sch_guided_iterative_chunked:
   case kmp_sch_guided_analytical_chunked:
+  case kmp_sch_guided_simd:
     schedtype = 2;
     break;
   default:
@@ -1991,6 +2035,89 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
   } // case
   break;
 
+  case kmp_sch_guided_simd: {
+    // same as iterative but curr-chunk adjusted to be multiple of given
+    // chunk
+    T chunk = pr->u.p.parm1;
+    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
+                   gtid));
+    trip = pr->u.p.tc;
+    // Start atomic part of calculations
+    while (1) {
+      ST remaining; // signed, because can be < 0
+      init = sh->u.s.iteration; // shared value
+      remaining = trip - init;
+      if (remaining <= 0) { // AC: need to compare with 0 first
+        status = 0; // nothing to do, don't try atomic op
+        break;
+      }
+      KMP_DEBUG_ASSERT(init % chunk == 0);
+      // compare with K*nproc*(chunk+1), K=2 by default
+      if ((T)remaining < pr->u.p.parm2) {
+        // use dynamic-style schedule
+        // atomically increment iterations, get old value
+        init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunk);
+        remaining = trip - init;
+        if (remaining <= 0) {
+          status = 0; // all iterations got by other threads
+        } else {
+          // got some iterations to work on
+          status = 1;
+          if ((T)remaining > chunk) {
+            limit = init + chunk - 1;
+          } else {
+            last = 1; // the last chunk
+            limit = init + remaining - 1;
+          } // if
+        } // if
+        break;
+      } // if
+      // divide by K*nproc
+      UT span = remaining * (*(double *)&pr->u.p.parm3);
+      UT rem = span % chunk;
+      if (rem) // adjust so that span%chunk == 0
+        span += chunk - rem;
+      limit = init + span;
+      if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init,
+                               (ST)limit)) {
+        // CAS was successful, chunk obtained
+        status = 1;
+        --limit;
+        break;
+      } // if
+    } // while
+    if (status != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      if (p_st != NULL)
+        *p_st = incr;
+      *p_lb = start + init * incr;
+      *p_ub = start + limit * incr;
+      if (pr->ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+#ifdef KMP_DEBUG
+        {
+          const char *buff;
+          // create format specifiers before the debug output
+          buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
+                                  "ordered_lower:%%%s ordered_upper:%%%s\n",
+                                  traits_t<UT>::spec, traits_t<UT>::spec);
+          KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
+                          pr->u.p.ordered_upper));
+          __kmp_str_free(&buff);
+        }
+#endif
+      } // if
+    } else {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } // if
+  } // case
+  break;
+
   case kmp_sch_guided_analytical_chunked: {
     T chunkspec = pr->u.p.parm1;
     UT chunkIdx;
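As the comments in the new case describe, kmp_sch_guided_simd behaves like the iterative guided schedule: each new span is roughly remaining/(K*nproc), using the factor stored in parm3 and falling back to plain chunk-sized dynamic dispatch once remaining drops below the parm2 threshold, except that the span is rounded up so it stays a multiple of the simd chunk. A standalone model of that span computation with hypothetical names (guided_simd_span is not a runtime function; K = 2 matches the default noted above):

#include <cstdio>

// Model of the span calculation in the CAS loop above: divide the remaining
// trip count by K*nproc, then round up to a multiple of the simd chunk.
static unsigned long guided_simd_span(unsigned long remaining, unsigned nproc,
                                      unsigned chunk, unsigned K = 2) {
  double factor = 1.0 / (K * nproc); // dispatch_init stores this in parm3
  unsigned long span = (unsigned long)(remaining * factor);
  unsigned long rem = span % chunk;
  if (rem) // adjust so that span % chunk == 0
    span += chunk - rem;
  return span;
}

int main() {
  // 10000 remaining iterations, 4 threads, simd chunk 8:
  // 10000 / (2 * 4) = 1250, rounded up to a multiple of 8 -> 1256
  std::printf("%lu\n", guided_simd_span(10000, 4, 8));
  return 0;
}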