Skip to content

Commit 3f6ae3f

Browse files
authored
[flang] Added driver options for arrays repacking. (#134002)
Added options: * -f[no-]repack-arrays * -f[no-]stack-repack-arrays * -frepack-arrays-contiguity=whole/innermost
1 parent 3e59ff2 commit 3f6ae3f

File tree

11 files changed

+201
-38
lines changed

11 files changed

+201
-38
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6825,7 +6825,6 @@ defm real_8_real_10 : BooleanFFlag<"real-8-real-10">, Group<gfortran_Group>;
68256825
defm real_8_real_16 : BooleanFFlag<"real-8-real-16">, Group<gfortran_Group>;
68266826
defm real_8_real_4 : BooleanFFlag<"real-8-real-4">, Group<gfortran_Group>;
68276827
defm recursive : BooleanFFlag<"recursive">, Group<gfortran_Group>;
6828-
defm repack_arrays : BooleanFFlag<"repack-arrays">, Group<gfortran_Group>;
68296828
defm second_underscore : BooleanFFlag<"second-underscore">, Group<gfortran_Group>;
68306829
defm sign_zero : BooleanFFlag<"sign-zero">, Group<gfortran_Group>;
68316830
defm whole_file : BooleanFFlag<"whole-file">, Group<gfortran_Group>;
@@ -6967,6 +6966,52 @@ defm unsigned : OptInFC1FFlag<"unsigned", "Enables UNSIGNED type">;
69676966
def fno_automatic : Flag<["-"], "fno-automatic">, Group<f_Group>,
69686967
HelpText<"Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE">;
69696968

6969+
defm repack_arrays
6970+
: BoolOptionWithoutMarshalling<
6971+
"f", "repack-arrays", PosFlag<SetTrue, [], [], "Pack">,
6972+
NegFlag<SetFalse, [], [], "Do not pack">,
6973+
BothFlags<[], [],
6974+
" non-contiguous assumed shape dummy arrays into "
6975+
"contiguous memory">>,
6976+
DocBrief<[{Create temporary copies of non-contiguous assumed shape dummy
6977+
arrays in subprogram prologues, and destroy them in subprogram epilogues.
6978+
The temporary copy is initialized with values from the original array
6979+
in the prologue, if needed. In the epilogue, the current values
6980+
in the temporary array are copied into the original array, if needed.
6981+
6982+
Accessing the contiguous temporary in the program code may result
6983+
in faster execution compared to accessing elements of the original array,
6984+
when they are sparse in memory. At the same time, the overhead
6985+
of copying values between the original and the temporary arrays
6986+
may be significant, which may slow down some programs.
6987+
6988+
Enabling array repacking may also change the behavior of certain
6989+
programs:
6990+
6991+
* The copy actions may introduce a data race in valid OpenACC/OpenMP programs.
6992+
For example, if different threads execute the same subprogram
6993+
with a non-contiguous assumed shape dummy array, and the different threads
6994+
access unrelated parts of the array, then the whole array copy
6995+
made in each thread will cause a data race.
6996+
* OpenACC/OpenMP offload programs may behave incorrectly with regard
6997+
to the device data environment, due to the fact that the original
6998+
array and the temporary may have different presence status on the device.
6999+
* ``IS_CONTIGUOUS`` intrinsic may return ``TRUE`` with the array repacking
7000+
enabled, whereas it would return ``FALSE`` with the repacking disabled.
7001+
* The result of ``LOC`` intrinsic applied to an actual argument associated
7002+
with a non-contiguous assumed shape dummy array, may be different
7003+
from the result of ``LOC`` applied to the dummy array.}]>;
7004+
7005+
def frepack_arrays_contiguity_EQ
7006+
: Joined<["-"], "frepack-arrays-contiguity=">,
7007+
Group<f_Group>,
7008+
Values<"whole,innermost">,
7009+
HelpText<
7010+
"When -frepack-arrays is in effect, 'whole' enables "
7011+
"repacking for arrays that are non-contiguous in any dimension, "
7012+
"'innermost' enables repacking for arrays that are non-contiguous "
7013+
"in the innermost dimension (the default)">;
7014+
69707015
defm save_main_program : BoolOptionWithoutMarshalling<"f", "save-main-program",
69717016
PosFlag<SetTrue, [], [],
69727017
"Place all main program variables in static memory (otherwise scalars may be placed on the stack)">,
@@ -6980,6 +7025,22 @@ defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stri
69807025
PosFlag<SetTrue, [], [ClangOption], "Create unit-strided versions of loops">,
69817026
NegFlag<SetFalse, [], [ClangOption], "Do not create unit-strided loops (default)">>;
69827027

7028+
defm stack_repack_arrays
7029+
: BoolOptionWithoutMarshalling<
7030+
"f", "stack-repack-arrays",
7031+
PosFlag<SetTrue, [], [],
7032+
"Attempt to allocate array temporaries created under "
7033+
"-frepack-arrays on the stack">,
7034+
NegFlag<
7035+
SetFalse, [], [],
7036+
"Allocate -frepack-arrays temporaries on the heap (default)">>,
7037+
DocBrief<[{Controls whether the array temporaries created under
7038+
**-frepack-arrays** are allocated on the stack or on the heap.
7039+
7040+
By default, the heap is used. Allocations of polymorphic types
7041+
are always done on the heap, though this may change in future releases.
7042+
}]>;
7043+
69837044
def fhermetic_module_files : Flag<["-"], "fhermetic-module-files">, Group<f_Group>,
69847045
HelpText<"Emit hermetic module files (no nested USE association)">;
69857046

clang/lib/Driver/ToolChains/Flang.cpp

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -157,16 +157,26 @@ void Flang::addCodegenOptions(const ArgList &Args,
157157
if (shouldLoopVersion(Args))
158158
CmdArgs.push_back("-fversion-loops-for-stride");
159159

160-
Args.addAllArgs(CmdArgs,
161-
{options::OPT_fdo_concurrent_to_openmp_EQ,
162-
options::OPT_flang_experimental_hlfir,
163-
options::OPT_flang_deprecated_no_hlfir,
164-
options::OPT_fno_ppc_native_vec_elem_order,
165-
options::OPT_fppc_native_vec_elem_order,
166-
options::OPT_finit_global_zero,
167-
options::OPT_fno_init_global_zero, options::OPT_ftime_report,
168-
options::OPT_ftime_report_EQ, options::OPT_funroll_loops,
169-
options::OPT_fno_unroll_loops});
160+
for (const auto &arg :
161+
Args.getAllArgValues(options::OPT_frepack_arrays_contiguity_EQ))
162+
if (arg.compare("whole") != 0 && arg.compare("innermost") != 0) {
163+
getToolChain().getDriver().Diag(diag::err_drv_unsupported_option_argument)
164+
<< "-frepack-arrays-contiguity=" << arg;
165+
}
166+
167+
Args.addAllArgs(
168+
CmdArgs,
169+
{options::OPT_fdo_concurrent_to_openmp_EQ,
170+
options::OPT_flang_experimental_hlfir,
171+
options::OPT_flang_deprecated_no_hlfir,
172+
options::OPT_fno_ppc_native_vec_elem_order,
173+
options::OPT_fppc_native_vec_elem_order, options::OPT_finit_global_zero,
174+
options::OPT_fno_init_global_zero, options::OPT_frepack_arrays,
175+
options::OPT_fno_repack_arrays,
176+
options::OPT_frepack_arrays_contiguity_EQ,
177+
options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays,
178+
options::OPT_ftime_report, options::OPT_ftime_report_EQ,
179+
options::OPT_funroll_loops, options::OPT_fno_unroll_loops});
170180
}
171181

172182
void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const {

flang/docs/ArrayRepacking.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,13 @@ Having these results it seems reasonable to provide support for arrays repacking
3939

4040
#### Facts and guesses about the implementation
4141

42-
The dynamic checks for continuity and the array copy code is located completely in the [runtime](https://github.com/gcc-mirror/gcc/blob/3e08a4ecea27c54fda90e8f58641b1986ad957e1/libgfortran/generated/in_pack_r8.c#L35), so the compiler inserts unconditional calls in the subprogram prologue/epilogue.
42+
The dynamic checks for contiguity and the array copy code are located completely in the [runtime](https://github.com/gcc-mirror/gcc/blob/3e08a4ecea27c54fda90e8f58641b1986ad957e1/libgfortran/generated/in_pack_r8.c#L35), so the compiler inserts unconditional calls in the subprogram prologue/epilogue.
4343

4444
It looks like `gfortran` ignores `intent(out)/intent(in)` which could have helped to avoid some of the `pack/unpack` overhead.
4545

4646
It looks like the `pack`/`unpack` actions are inserted early in the compilation pipeline, and these extra calls affect behavior of the later optimization passes. For example, `Polyhedron/fatigue2` slows down by about 2x with `-frepack-arrays`: this slowdown is not caused by the `pack`/`unpack` overhead, but is a consequence of worse function inlining decisions made after the calls insertion. The benchmark becomes even faster than the original version with `-frepack-arrays` and proper `-finline-limit=` settings, but it does not look like the benchmark contains code that would benefit from the array repacking.
4747

48-
It does not look like `gfortran` is able to eliminate the `pack`/`unpack` code after the function inlining, if the actual argument is statically known to be contiguous. So the overhead from the dynamic continuity checks is inevitable when `-frepack-arrays` is specified.
48+
It does not look like `gfortran` is able to eliminate the `pack`/`unpack` code after the function inlining, if the actual argument is statically known to be contiguous. So the overhead from the dynamic contiguity checks is inevitable when `-frepack-arrays` is specified.
4949

5050
It does not look like `gfortran` tries to optimize the insertion of `pack`/`unpack` code. For example, if a dummy array is only used under a condition within the subprogram, the repacking code might be inserted under the same condition to minimize the overhead on the unconditional path through the subprogram.
5151

@@ -59,7 +59,7 @@ It does not look like `gfortran` tries to optimize the insertion of `pack`/`unpa
5959

6060
#### Facts and guesses about the implementation
6161

62-
The `pack` code is only generated if the actual argument may be non-contiguous in the innermost dimension, as determined statically, i.e. the compiler does not generate any dynamic continuity checks. For example:
62+
The `pack` code is only generated if the actual argument may be non-contiguous in the innermost dimension, as determined statically, i.e. the compiler does not generate any dynamic contiguity checks. For example:
6363

6464
```Fortran
6565
interface
@@ -132,8 +132,8 @@ So it does not seem practical/reasonable to enable the array repacking by defaul
132132
### Performance
133133

134134
1. Minimize the overhead of array repacking, e.g. avoid copy-in/out whenever possible, execute copy-in/out only on the execution paths where the array is accessed.
135-
2. Provide different modes of repacking depending on the "continuity" meaning, i.e. one - array is contiguous in the innermost dimension, two - array is contiguous in all dimensions.
136-
3. Avoid generating repacking code, when the "continuity" can be statically proven (including after optimization passes like constant propagation, function inlining, etc.).
135+
2. Provide different modes of repacking depending on the "contiguity" meaning, i.e. one - array is contiguous in the innermost dimension, two - array is contiguous in all dimensions.
136+
3. Avoid generating repacking code, when the "contiguity" can be statically proven (including after optimization passes like constant propagation, function inlining, etc.).
137137
4. Use a set of heuristics to avoid generating repacking code based on the array usage pattern, e.g. if an array is proven not to be used in an array expression or a loop, etc.
138138
5. Use a set of heuristics to avoid repacking actions dynamically, e.g. based on the array size, element size, byte stride(s) of the [innermost] dimension(s), etc.
139139
6. Minimize the impact of the IR changes, introduced by repacking, on the later optimization passes.
@@ -156,7 +156,7 @@ Controlled by cli options, Lowering will generate a `fir.pack_array` operation i
156156
The new operations will hold all the information that customizes further handling of the `pack`/`unpack` actions, such as:
157157

158158
* Optional array of attributes supporting an interface to generate a predicate that says if the repacking is safe in the current context.
159-
* The continuity mode: `innermost` vs `whole`.
159+
* The contiguity mode: `innermost` vs `whole`.
160160
* Attributes selecting the heuristics (both compiler and runtime ones) that may be applied to avoid `pack`/`unpack` actions.
161161
* Other attributes, like `stack` vs `heap` to manage the temporary allocation according to `-fstack-repack-arrays`, etc.
162162

@@ -195,7 +195,7 @@ The operation creates a new `!fir.box/class<!fir.array<>>` value to represent ei
195195
Arguments:
196196

197197
* `stack` - indicates if `-fstack-repack-arrays` is in effect for compiling this function.
198-
* `innermost` - tells that the repacking has to be done iff the array is not contiguous in the innermost dimension. This also describes what type of continuity can be expected from `%new_var`, i.e. `innermost` means that the resulting array is definitely contiguous in the innermost dimension, but may be non-contiguous in other dimensions (unless additional analysis proves otherwise). For 1-D arrays, `innermost` attribute is not valid.
198+
* `innermost` - tells that the repacking has to be done iff the array is not contiguous in the innermost dimension. This also describes what type of contiguity can be expected from `%new_var`, i.e. `innermost` means that the resulting array is definitely contiguous in the innermost dimension, but may be non-contiguous in other dimensions (unless additional analysis proves otherwise). For 1-D arrays, `innermost` attribute is not valid.
199199
* `no_copy` - indicates that, in case a temporary array is created, `%var` to `%new_var` copy is not required (`intent(out)` dummy argument case).
200200
* `heuristics`
201201
* `loop-only` - `fir.pack_array` can be optimized away, if the array is not used in a loop.
@@ -351,7 +351,7 @@ The `fir.pack_array`'s copy-in action cannot be skipped for `INTENT(OUT)` dummy
351351

352352
#### Optional behavior
353353

354-
In case of the `whole` continuity mode or with 1-D array, Flang can propagate this information to `hlfir.declare` - this may improve optimizations down the road. This can be done iff the repacking has no dynamic constraints and/or heuristics. For example:
354+
In case of the `whole` contiguity mode or with 1-D array, Flang can propagate this information to `hlfir.declare` - this may improve optimizations down the road. This can be done iff the repacking has no dynamic constraints and/or heuristics. For example:
355355

356356
```
357357
%c0 = arith.constant 0 : index
@@ -441,10 +441,11 @@ In cases where `fir.pack_array` is statically known to produce a copy that is co
441441
The following user options are proposed:
442442
443443
* `-frepack-arrays` - the option forces Flang to repack a non-contiguous assumed-shape dummy array into a temporary contiguous memory, which may result in faster accesses of the array. The compiler will insert special code in subprogram prologue to allocate a temporary array and copy the original array into the temporary; in subprogram epilogue, it will insert a copy from the temporary array into the original array and deallocate the temporary. The overhead of the allocation/deallocation and the copies may be significant depending on the array size. The compiler will try to optimize the unnecessary/unprofitable repacking.
444+
* `-fstack-repack-arrays` - attempt allocating the temporary arrays in stack memory. By default, they are allocated in heap memory (note that `-fstack-arrays` does not affect the allocation of the temporaries created for the arrays repacking).
444445
* `-frepack-arrays-opts=[none|loop-only]` - the option enables optimizations that may eliminate the array repacking code depending on the array usage pattern:
445446
* `none` - no optimizations.
446447
* `loop-only` - the array repacking code will be removed in any subprogram where the array is not used inside a loop or an array expression.
447-
* `-frepack-arrays-continuity=[whole|innermost]`:
448+
* `-frepack-arrays-contiguity=[whole|innermost]`:
448449
* `whole` - the option will repack arrays that are non-contiguous in any dimension.
449450
* `innermost` - the option will repack arrays that are non-contiguous in the innermost dimension (default).
450451
* `-frepack-arrays-max-size=<int>` - arrays bigger than the specified size will not be repacked.

flang/include/flang/Lower/LoweringOptions.def

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,15 @@ ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1)
4848
/// On by default.
4949
ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1)
5050

51-
/// If true, the arrays of unknown size and array temporaries
52-
/// are requested to be allocated in stack memory.
53-
ENUM_LOWERINGOPT(StackArrays, unsigned, 1, 0)
54-
5551
/// If true, the dummy assumed shape arrays are conditionally
5652
/// packed into contiguous memory.
5753
ENUM_LOWERINGOPT(RepackArrays, unsigned, 1, 0)
5854

55+
/// If true, the temporary arrays created under RepackArrays
56+
/// control will be allocated in stack memory. If false,
57+
/// they will be allocated in heap memory.
58+
ENUM_LOWERINGOPT(StackRepackArrays, unsigned, 1, 0)
59+
5960
/// If true, the repacking (RepackArrays option above)
6061
/// will be done for arrays non-contiguous in any dimension,
6162
/// otherwise, it will be done only for arrays non-contiguous

flang/lib/Frontend/CompilerInvocation.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,6 +1476,19 @@ bool CompilerInvocation::createFromArgs(
14761476
clang::driver::options::OPT_fno_realloc_lhs, true))
14771477
invoc.loweringOpts.setReallocateLHS(false);
14781478

1479+
invoc.loweringOpts.setRepackArrays(
1480+
args.hasFlag(clang::driver::options::OPT_frepack_arrays,
1481+
clang::driver::options::OPT_fno_repack_arrays,
1482+
/*default=*/false));
1483+
invoc.loweringOpts.setStackRepackArrays(
1484+
args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays,
1485+
clang::driver::options::OPT_fno_stack_repack_arrays,
1486+
/*default=*/false));
1487+
if (auto *arg = args.getLastArg(
1488+
clang::driver::options::OPT_frepack_arrays_contiguity_EQ))
1489+
invoc.loweringOpts.setRepackArraysWhole(arg->getValue() ==
1490+
llvm::StringRef{"whole"});
1491+
14791492
success &= parseFrontendArgs(invoc.getFrontendOpts(), args, diags);
14801493
parseTargetArgs(invoc.getTargetOpts(), args);
14811494
parsePreprocessorArgs(invoc.getPreprocessorOpts(), args);

flang/lib/Lower/ConvertVariable.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2630,7 +2630,7 @@ Fortran::lower::genPackArray(Fortran::lower::AbstractConverter &converter,
26302630
});
26312631
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
26322632
const mlir::Location loc = genLocation(converter, sym);
2633-
bool stackAlloc = opts.getStackArrays();
2633+
bool stackAlloc = opts.getStackRepackArrays();
26342634
// 1D arrays must always use 'whole' mode.
26352635
bool isInnermostMode = !opts.getRepackArraysWhole() && sym.Rank() > 1;
26362636
// Avoid copy-in for 'intent(out)' variable, unless this is a dummy
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
! Test forwarding of -frepack-arrays-contiguity options:
2+
! RUN: %flang -frepack-arrays-contiguity=whole %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=WHOLECMD %s
3+
! RUN: %flang -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=INNERMOSTCMD %s
4+
! RUN: %flang -frepack-arrays-contiguity=innermost -frepack-arrays-contiguity=whole %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=WHOLECMD %s
5+
! RUN: %flang -frepack-arrays-contiguity=whole -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=INNERMOSTCMD %s
6+
! RUN: not %flang -frepack-arrays-contiguity= -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=ERROR %s
7+
! RUN: not %flang -frepack-arrays-contiguity=whole3 -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=ERROR %s
8+
! RUN: not %flang -frepack-arrays-contiguity=innermostg -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=ERROR %s
9+
10+
! Test proper setting of the lowering options:
11+
! RUN: %flang_fc1 -frepack-arrays -frepack-arrays-contiguity=whole %s -emit-hlfir -o - | FileCheck --check-prefix=WHOLE %s
12+
! RUN: %flang_fc1 -frepack-arrays-contiguity=whole %s -emit-hlfir -o - | FileCheck --check-prefix=NOREPACK %s
13+
! RUN: %flang_fc1 -frepack-arrays -frepack-arrays-contiguity=innermost %s -emit-hlfir -o - | FileCheck --check-prefix=INNERMOST %s
14+
! RUN: %flang_fc1 -frepack-arrays-contiguity=innermost %s -emit-hlfir -o - | FileCheck --check-prefix=NOREPACK %s
15+
16+
! Default setting is 'innermost':
17+
! RUN: %flang_fc1 -frepack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=INNERMOST %s
18+
19+
! ERROR: error: unsupported argument '{{.*}}' to option '-frepack-arrays-contiguity='
20+
21+
! WHOLECMD: "-fc1"{{.*}}"-frepack-arrays-contiguity=whole"
22+
! INNERMOSTCMD: "-fc1"{{.*}}"-frepack-arrays-contiguity=innermost"
23+
24+
subroutine test(x)
25+
real :: x(:,:)
26+
! WHOLE: fir.pack_array{{.*}}whole
27+
! WHOLE: fir.unpack_array
28+
! INNERMOST: fir.pack_array{{.*}}innermost
29+
! INNERMOST: fir.unpack_array
30+
! NOREPACK-NOT: fir.pack_array
31+
! NOREPACK-NOT: fir.unpack_array
32+
end subroutine

0 commit comments

Comments
 (0)