
[OpenMP] Add skewed iteration distribution on hybrid systems #69946

Merged: 3 commits, Nov 8, 2023
Changes from all commits
67 changes: 45 additions & 22 deletions openmp/runtime/src/kmp.h
@@ -27,6 +27,9 @@
#ifndef KMP_STATIC_STEAL_ENABLED
#define KMP_STATIC_STEAL_ENABLED 1
#endif
#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \
(KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \
(KMP_ARCH_X86 || KMP_ARCH_X86_64))

#define TASK_CURRENT_NOT_QUEUED 0
#define TASK_CURRENT_QUEUED 1
@@ -881,14 +884,8 @@ typedef struct kmp_affinity_flags_t {
KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);

typedef struct kmp_affinity_ids_t {
+int os_id;
int ids[KMP_HW_LAST];
-int operator[](size_t idx) const { return ids[idx]; }
-int &operator[](size_t idx) { return ids[idx]; }
-kmp_affinity_ids_t &operator=(const kmp_affinity_ids_t &rhs) {
-for (int i = 0; i < KMP_HW_LAST; ++i)
-ids[i] = rhs[i];
-return *this;
-}
} kmp_affinity_ids_t;

typedef struct kmp_affinity_attrs_t {
@@ -938,6 +935,10 @@ extern kmp_affin_mask_t *__kmp_affin_fullMask;
extern kmp_affin_mask_t *__kmp_affin_origMask;
extern char *__kmp_cpuinfo_file;

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
extern int __kmp_first_osid_with_ecore;
#endif

#endif /* KMP_AFFINITY_SUPPORTED */

// This needs to be kept in sync with the values in omp.h !!!
@@ -1845,12 +1846,9 @@ typedef struct kmp_sched_flags {
unsigned ordered : 1;
unsigned nomerge : 1;
unsigned contains_last : 1;
-#if KMP_USE_HIER_SCHED
-unsigned use_hier : 1;
-unsigned unused : 28;
-#else
-unsigned unused : 29;
-#endif
+unsigned use_hier : 1; // Used in KMP_USE_HIER_SCHED code
+unsigned use_hybrid : 1; // Used in KMP_WEIGHTED_ITERATIONS_SUPPORTED code
+unsigned unused : 27;
} kmp_sched_flags_t;

KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
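
The five named bits plus the 27-bit pad must still pack into a single 32-bit word, which the KMP_BUILD_ASSERT above enforces. A minimal standalone sketch of the same layout check, using plain static_assert in place of KMP_BUILD_ASSERT (names here are illustrative, not the runtime's):

// Mirror of the flag layout above: 1+1+1+1+1+27 = 32 bits.
typedef struct sched_flags {
  unsigned ordered : 1;
  unsigned nomerge : 1;
  unsigned contains_last : 1;
  unsigned use_hier : 1;   // consulted only by KMP_USE_HIER_SCHED code
  unsigned use_hybrid : 1; // consulted only by weighted-iterations code
  unsigned unused : 27;
} sched_flags_t;

static_assert(sizeof(sched_flags_t) == 4, "flags must remain one 32-bit word");
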
@@ -1864,26 +1862,37 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 st;
kmp_int32 tc;
kmp_lock_t *steal_lock; // lock used for chunk stealing

+kmp_uint32 ordered_lower;
+kmp_uint32 ordered_upper;

// KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
// a) parm3 is properly aligned and
// b) all parm1-4 are on the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are on the same cache line (not measured though).

-struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
-kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should
-kmp_int32 parm2; // make no real change at least while padding is off.
+struct KMP_ALIGN(32) {
+kmp_int32 parm1;
+kmp_int32 parm2;
kmp_int32 parm3;
kmp_int32 parm4;
};

-kmp_uint32 ordered_lower;
-kmp_uint32 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+kmp_uint32 pchunks;
+kmp_uint32 num_procs_with_pcore;
+kmp_int32 first_thread_with_ecore;
+#endif
#if KMP_OS_WINDOWS
kmp_int32 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info32_t;

#if CACHE_LINE <= 128
KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128);
#endif
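
The comment above claims that a 32-byte-aligned parm block keeps parm1-4 on one cache line. A minimal standalone illustration of that claim, assuming KMP_ALIGN(32) expands to a 32-byte alignment request when padding is on (alignas stands in for the macro):

#include <cstddef>

struct Parms {
  alignas(32) int parm1; // 32-byte alignment, as KMP_ALIGN(32) requests
  int parm2;
  int parm3;
  int parm4;
};

static_assert(alignof(Parms) == 32, "parm block starts on a 32-byte boundary");
// parm1-4 occupy bytes [0, 16) of a 32-byte-aligned block, so they can
// never straddle a 64-byte (or larger) cache line.
static_assert(offsetof(Parms, parm4) + sizeof(int) <= 32,
              "parm1-4 share one cache line");
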

typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 count; // current chunk number for static & static-steal scheduling
kmp_int64 ub; /* upper-bound */
@@ -1892,27 +1901,38 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 st; /* stride */
kmp_int64 tc; /* trip count (number of iterations) */
kmp_lock_t *steal_lock; // lock used for chunk stealing

+kmp_uint64 ordered_lower;
+kmp_uint64 ordered_upper;
/* parm[1-4] are used in different ways by different scheduling algorithms */

-// KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+// KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).

struct KMP_ALIGN(32) {
kmp_int64 parm1;
kmp_int64 parm2;
kmp_int64 parm3;
kmp_int64 parm4;
};

-kmp_uint64 ordered_lower;
-kmp_uint64 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+kmp_uint64 pchunks;
+kmp_uint64 num_procs_with_pcore;
+kmp_int64 first_thread_with_ecore;
+#endif

#if KMP_OS_WINDOWS
kmp_int64 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info64_t;

#if CACHE_LINE <= 128
KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128);
#endif
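
The new pchunks, num_procs_with_pcore, and first_thread_with_ecore fields carry the per-loop state for the skewed distribution named in the PR title; the distribution logic itself lives in kmp_dispatch.cpp and is not shown in this diff. Purely as a hypothetical illustration of the idea, not the runtime's actual formula: a weighting that gives each P-core thread w times the share of an E-core thread (all names and the 2:1 weight below are assumptions):

#include <cstdio>

// Hypothetical sketch only: split `tc` iterations across `nthreads` so each
// P-core thread gets roughly `w` times the share of an E-core thread.
static void skewed_shares(long tc, int nthreads, int num_pcore, int w) {
  int num_ecore = nthreads - num_pcore;
  long units = (long)w * num_pcore + num_ecore; // total weight units
  long e_share = tc / units;  // iterations per E-core thread
  long p_share = e_share * w; // iterations per P-core thread
  long leftover = tc - (p_share * num_pcore + e_share * num_ecore);
  std::printf("P-core: %ld  E-core: %ld  leftover: %ld\n", p_share, e_share,
              leftover); // leftover would be spread one iteration at a time
}

int main() {
  skewed_shares(1000, 12, 4, 2); // e.g. 4 P-cores weighted 2:1 vs 8 E-cores
  return 0;
}
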

#else /* KMP_STATIC_STEAL_ENABLED */
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 lb;
@@ -3821,6 +3841,9 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
extern int __kmp_get_first_osid_with_ecore(void);
#endif
#if KMP_OS_LINUX || KMP_OS_FREEBSD
extern int kmp_set_thread_affinity_mask_initial(void);
#endif
38 changes: 32 additions & 6 deletions openmp/runtime/src/kmp_affinity.cpp
@@ -4163,7 +4163,7 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,

// Initialize ids and attrs thread data
for (int i = 0; i < KMP_HW_LAST; ++i)
-ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
attrs = KMP_AFFINITY_ATTRS_UNKNOWN;

// Iterate through each os id within the mask and determine
@@ -4172,19 +4172,20 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
int depth = __kmp_topology->get_depth();
KMP_CPU_SET_ITERATE(cpu, mask) {
int osid_idx = __kmp_osid_to_hwthread_map[cpu];
+ids.os_id = cpu;
const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
for (int level = 0; level < depth; ++level) {
kmp_hw_t type = __kmp_topology->get_type(level);
int id = hw_thread.sub_ids[level];
-if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
-ids[type] = id;
+if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
+ids.ids[type] = id;
} else {
// This mask spans across multiple topology units, set it as such
// and mark every level below as such as well.
-ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
for (; level < depth; ++level) {
kmp_hw_t type = __kmp_topology->get_type(level);
-ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
}
}
}
@@ -4264,6 +4265,9 @@ static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
__kmp_affinity_get_topology_info(affinity);
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
__kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
#endif
}
}

@@ -4843,7 +4847,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {

// Set the thread topology information to default of unknown
for (int id = 0; id < KMP_HW_LAST; ++id)
-th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
+th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;

if (!KMP_AFFINITY_CAPABLE()) {
@@ -5240,6 +5244,28 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Returns first os proc id with ATOM core
int __kmp_get_first_osid_with_ecore(void) {
int low = 0;
int high = __kmp_topology->get_num_hw_threads() - 1;
int mid = 0;
while (high - low > 1) {
mid = (high + low) / 2;
if (__kmp_topology->at(mid).attrs.get_core_type() ==
KMP_HW_CORE_TYPE_CORE) {
low = mid + 1;
} else {
high = mid;
}
}
if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
return mid;
}
return -1;
}
#endif
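
For reference, a self-contained model of the search above (illustrative types, not the runtime's). It relies on the same precondition the runtime does: the topology table lists every CORE (P-core) hardware thread before any ATOM (E-core) one, so a lower-bound binary search finds the first E-core index, or -1 when the machine has no E-cores:

#include <cstdio>
#include <vector>

enum class CoreType { Pcore, Ecore }; // CORE vs ATOM in the runtime's terms

// Lower-bound search: first index whose core type is Ecore, assuming all
// Pcore entries sort before all Ecore entries; -1 if there is no Ecore.
static int first_ecore(const std::vector<CoreType> &hw) {
  int low = 0, high = (int)hw.size(); // half-open [low, high)
  while (low < high) {
    int mid = low + (high - low) / 2;
    if (hw[mid] == CoreType::Pcore)
      low = mid + 1; // first E-core must be to the right of mid
    else
      high = mid; // mid itself may be the first E-core
  }
  return low < (int)hw.size() ? low : -1;
}

int main() {
  std::vector<CoreType> hybrid = {CoreType::Pcore, CoreType::Pcore,
                                  CoreType::Ecore, CoreType::Ecore};
  std::printf("%d\n", first_ecore(hybrid)); // prints 2
  return 0;
}
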

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_DEBUG_ASSERT(th);