Skip to content

Commit 5cc603c

Browse files
authored
[OpenMP] Add skewed iteration distribution on hybrid systems (#69946)
This commit adds skewed distribution of iterations in nonmonotonic:dynamic schedule (static steal) for hybrid systems when thread affinity is assigned. Currently, it distributes the iterations at 60:40 ratio. Consider this loop with dynamic schedule type, for (int i = 0; i < 100; ++i). In a hybrid system with 20 hardware threads (16 CORE and 4 ATOM core), 88 iterations will be assigned to performance cores and 12 iterations will be assigned to efficient cores. Each thread with CORE core will process 5 iterations + extras and with ATOM core will process 3 iterations. Differential Revision: https://reviews.llvm.org/D152955
1 parent 3dff285 commit 5cc603c

File tree

6 files changed

+276
-56
lines changed

6 files changed

+276
-56
lines changed

openmp/runtime/src/kmp.h

Lines changed: 45 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,9 @@
2727
#ifndef KMP_STATIC_STEAL_ENABLED
2828
#define KMP_STATIC_STEAL_ENABLED 1
2929
#endif
30+
#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \
31+
(KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \
32+
(KMP_ARCH_X86 || KMP_ARCH_X86_64))
3033

3134
#define TASK_CURRENT_NOT_QUEUED 0
3235
#define TASK_CURRENT_QUEUED 1
@@ -881,14 +884,8 @@ typedef struct kmp_affinity_flags_t {
881884
KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);
882885

883886
typedef struct kmp_affinity_ids_t {
887+
int os_id;
884888
int ids[KMP_HW_LAST];
885-
int operator[](size_t idx) const { return ids[idx]; }
886-
int &operator[](size_t idx) { return ids[idx]; }
887-
kmp_affinity_ids_t &operator=(const kmp_affinity_ids_t &rhs) {
888-
for (int i = 0; i < KMP_HW_LAST; ++i)
889-
ids[i] = rhs[i];
890-
return *this;
891-
}
892889
} kmp_affinity_ids_t;
893890

894891
typedef struct kmp_affinity_attrs_t {
@@ -938,6 +935,10 @@ extern kmp_affin_mask_t *__kmp_affin_fullMask;
938935
extern kmp_affin_mask_t *__kmp_affin_origMask;
939936
extern char *__kmp_cpuinfo_file;
940937

938+
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
939+
extern int __kmp_first_osid_with_ecore;
940+
#endif
941+
941942
#endif /* KMP_AFFINITY_SUPPORTED */
942943

943944
// This needs to be kept in sync with the values in omp.h !!!
@@ -1849,12 +1850,9 @@ typedef struct kmp_sched_flags {
18491850
unsigned ordered : 1;
18501851
unsigned nomerge : 1;
18511852
unsigned contains_last : 1;
1852-
#if KMP_USE_HIER_SCHED
1853-
unsigned use_hier : 1;
1854-
unsigned unused : 28;
1855-
#else
1856-
unsigned unused : 29;
1857-
#endif
1853+
unsigned use_hier : 1; // Used in KMP_USE_HIER_SCHED code
1854+
unsigned use_hybrid : 1; // Used in KMP_WEIGHTED_ITERATIONS_SUPPORTED code
1855+
unsigned unused : 27;
18581856
} kmp_sched_flags_t;
18591857

18601858
KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
@@ -1868,26 +1866,37 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
18681866
kmp_int32 st;
18691867
kmp_int32 tc;
18701868
kmp_lock_t *steal_lock; // lock used for chunk stealing
1869+
1870+
kmp_uint32 ordered_lower;
1871+
kmp_uint32 ordered_upper;
1872+
18711873
// KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
18721874
// a) parm3 is properly aligned and
18731875
// b) all parm1-4 are on the same cache line.
18741876
// Because of parm1-4 are used together, performance seems to be better
18751877
// if they are on the same cache line (not measured though).
18761878

1877-
struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
1878-
kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should
1879-
kmp_int32 parm2; // make no real change at least while padding is off.
1879+
struct KMP_ALIGN(32) {
1880+
kmp_int32 parm1;
1881+
kmp_int32 parm2;
18801882
kmp_int32 parm3;
18811883
kmp_int32 parm4;
18821884
};
18831885

1884-
kmp_uint32 ordered_lower;
1885-
kmp_uint32 ordered_upper;
1886+
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
1887+
kmp_uint32 pchunks;
1888+
kmp_uint32 num_procs_with_pcore;
1889+
kmp_int32 first_thread_with_ecore;
1890+
#endif
18861891
#if KMP_OS_WINDOWS
18871892
kmp_int32 last_upper;
18881893
#endif /* KMP_OS_WINDOWS */
18891894
} dispatch_private_info32_t;
18901895

1896+
#if CACHE_LINE <= 128
1897+
KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128);
1898+
#endif
1899+
18911900
typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
18921901
kmp_int64 count; // current chunk number for static & static-steal scheduling
18931902
kmp_int64 ub; /* upper-bound */
@@ -1896,27 +1905,38 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
18961905
kmp_int64 st; /* stride */
18971906
kmp_int64 tc; /* trip count (number of iterations) */
18981907
kmp_lock_t *steal_lock; // lock used for chunk stealing
1908+
1909+
kmp_uint64 ordered_lower;
1910+
kmp_uint64 ordered_upper;
18991911
/* parm[1-4] are used in different ways by different scheduling algorithms */
19001912

1901-
// KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
1913+
// KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
19021914
// a) parm3 is properly aligned and
19031915
// b) all parm1-4 are in the same cache line.
19041916
// Because of parm1-4 are used together, performance seems to be better
19051917
// if they are in the same line (not measured though).
1906-
19071918
struct KMP_ALIGN(32) {
19081919
kmp_int64 parm1;
19091920
kmp_int64 parm2;
19101921
kmp_int64 parm3;
19111922
kmp_int64 parm4;
19121923
};
19131924

1914-
kmp_uint64 ordered_lower;
1915-
kmp_uint64 ordered_upper;
1925+
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
1926+
kmp_uint64 pchunks;
1927+
kmp_uint64 num_procs_with_pcore;
1928+
kmp_int64 first_thread_with_ecore;
1929+
#endif
1930+
19161931
#if KMP_OS_WINDOWS
19171932
kmp_int64 last_upper;
19181933
#endif /* KMP_OS_WINDOWS */
19191934
} dispatch_private_info64_t;
1935+
1936+
#if CACHE_LINE <= 128
1937+
KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128);
1938+
#endif
1939+
19201940
#else /* KMP_STATIC_STEAL_ENABLED */
19211941
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
19221942
kmp_int32 lb;
@@ -3862,6 +3882,9 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
38623882
extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
38633883
extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
38643884
extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
3885+
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
3886+
extern int __kmp_get_first_osid_with_ecore(void);
3887+
#endif
38653888
#if KMP_OS_LINUX || KMP_OS_FREEBSD
38663889
extern int kmp_set_thread_affinity_mask_initial(void);
38673890
#endif

openmp/runtime/src/kmp_affinity.cpp

Lines changed: 32 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -4196,7 +4196,7 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
41964196

41974197
// Initialize ids and attrs thread data
41984198
for (int i = 0; i < KMP_HW_LAST; ++i)
4199-
ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
4199+
ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
42004200
attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
42014201

42024202
// Iterate through each os id within the mask and determine
@@ -4205,19 +4205,20 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
42054205
int depth = __kmp_topology->get_depth();
42064206
KMP_CPU_SET_ITERATE(cpu, mask) {
42074207
int osid_idx = __kmp_osid_to_hwthread_map[cpu];
4208+
ids.os_id = cpu;
42084209
const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
42094210
for (int level = 0; level < depth; ++level) {
42104211
kmp_hw_t type = __kmp_topology->get_type(level);
42114212
int id = hw_thread.sub_ids[level];
4212-
if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
4213-
ids[type] = id;
4213+
if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
4214+
ids.ids[type] = id;
42144215
} else {
42154216
// This mask spans across multiple topology units, set it as such
42164217
// and mark every level below as such as well.
4217-
ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4218+
ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
42184219
for (; level < depth; ++level) {
42194220
kmp_hw_t type = __kmp_topology->get_type(level);
4220-
ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4221+
ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
42214222
}
42224223
}
42234224
}
@@ -4297,6 +4298,9 @@ static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
42974298
if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
42984299
machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
42994300
__kmp_affinity_get_topology_info(affinity);
4301+
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
4302+
__kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
4303+
#endif
43004304
}
43014305
}
43024306

@@ -4876,7 +4880,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
48764880

48774881
// Set the thread topology information to default of unknown
48784882
for (int id = 0; id < KMP_HW_LAST; ++id)
4879-
th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
4883+
th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
48804884
th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
48814885

48824886
if (!KMP_AFFINITY_CAPABLE()) {
@@ -5273,6 +5277,28 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
52735277
return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
52745278
}
52755279

5280+
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
5281+
// Returns first os proc id with ATOM core
5282+
int __kmp_get_first_osid_with_ecore(void) {
5283+
int low = 0;
5284+
int high = __kmp_topology->get_num_hw_threads() - 1;
5285+
int mid = 0;
5286+
while (high - low > 1) {
5287+
mid = (high + low) / 2;
5288+
if (__kmp_topology->at(mid).attrs.get_core_type() ==
5289+
KMP_HW_CORE_TYPE_CORE) {
5290+
low = mid + 1;
5291+
} else {
5292+
high = mid;
5293+
}
5294+
}
5295+
if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
5296+
return mid;
5297+
}
5298+
return -1;
5299+
}
5300+
#endif
5301+
52765302
// Dynamic affinity settings - Affinity balanced
52775303
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
52785304
KMP_DEBUG_ASSERT(th);

0 commit comments

Comments
 (0)