-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[clang][cmake] Apply bolt optimizations as part of the clang target #119896
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This change removes the need to call the clang-bolt target in order to apply BOLT optimizations to clang. Now running `ninja clang` will build a clang with BOLT optimizations, and `ninja check-clang` and `ninja install-clang` will test and install the BOLT-optimized clang too. The clang-bolt target has been kept for compatibility reasons, but it is now just an alias to the clang target. Also, this new design for applying the BOLT optimizations to clang will be easier to generalize and use to optimize other binaries/libraries in the project.
@llvm/pr-subscribers-clang Author: Tom Stellard (tstellar) Changes: This change removes the need to call the clang-bolt target in order to apply bolt optimizations to clang. Now running `ninja clang` will build a clang with bolt optimizations, and `ninja check-clang` and `ninja install-clang` will test and install bolt optimized clang too. The clang-bolt target has been kept for compatibility reasons, but it is now just an alias to the clang target. Also, this new design for applying the bolt optimizations to clang will be easier to generalize and use to optimize other binaries/libraries in the project. Full diff: https://github.com/llvm/llvm-project/pull/119896.diff 4 Files Affected:
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 27e8095534a65c..b7741ce49b46ca 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -873,58 +873,6 @@ if (CLANG_ENABLE_BOOTSTRAP)
endforeach()
endif()
-set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
- May be specified as Instrument or Perf or LBR to use a particular profiling \
- mechanism.")
-string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
-
-if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
- set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
- set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED})
- set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata)
-
- # Pass extra flag in no-LBR mode
- if (CLANG_BOLT STREQUAL "PERF")
- set(BOLT_NO_LBR "-nl")
- endif()
-
- if (CLANG_BOLT STREQUAL "INSTRUMENT")
- # Instrument clang with BOLT
- add_custom_target(clang-instrumented
- DEPENDS ${CLANG_INSTRUMENTED}
- )
- add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
- DEPENDS clang llvm-bolt
- COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
- -instrument --instrumentation-file-append-pid
- --instrumentation-file=${BOLT_FDATA}
- COMMENT "Instrumenting clang binary with BOLT"
- USES_TERMINAL
- VERBATIM
- )
- add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented)
- else() # perf or LBR
- add_custom_target(clang-bolt-training-deps DEPENDS clang)
- endif()
-
- # Optimize original (pre-bolt) Clang using the collected profile
- add_custom_target(clang-bolt
- DEPENDS clang-bolt-profile
- COMMAND ${CMAKE_COMMAND} -E rename $<TARGET_FILE:clang> ${CLANG_PATH}-prebolt
- COMMAND ${CMAKE_COMMAND} -E create_symlink ${CLANG_PATH}-prebolt ${CLANG_PATH}++-prebolt
- COMMAND llvm-bolt ${CLANG_PATH}-prebolt
- -o $<TARGET_FILE:clang>
- -data ${BOLT_FDATA}
- -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions
- -split-all-cold -split-eh -dyno-stats -use-gnu-stack
- -update-debug-sections
- ${BOLT_NO_LBR}
- COMMENT "Optimizing Clang with BOLT"
- USES_TERMINAL
- VERBATIM
- )
-endif()
-
if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION)
add_subdirectory(utils/ClangVisualizers)
endif()
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index a4debc2dd2e895..ad336fcc45b600 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -23,6 +23,18 @@ if(CLANG_PLUGIN_SUPPORT)
set(support_plugins SUPPORT_PLUGINS)
endif()
+set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
+ May be specified as Instrument or Perf or LBR to use a particular profiling \
+ mechanism.")
+string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
+
+if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
+ set(CLANG_BOLT_DEPS clear-bolt-fdata llvm-bolt llvm-readobj)
+ if (NOT CLANG_BOLT STREQUAL "INSTRUMENT")
+ list(APPEND CLANG_BOLT_DEPS clear-perf-data)
+ endif()
+endif()
+
add_clang_tool(clang
driver.cpp
cc1_main.cpp
@@ -35,6 +47,7 @@ add_clang_tool(clang
ARMTargetParserTableGen
AArch64TargetParserTableGen
${support_plugins}
+ ${CLANG_BOLT_DEPS}
GENERATE_DRIVER
)
@@ -134,3 +147,42 @@ if(CLANG_ORDER_FILE AND
set_target_properties(clang PROPERTIES LINK_DEPENDS ${CLANG_ORDER_FILE})
endif()
endif()
+
+if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
+ # Add a clang-bolt target for backwards compatibility.
+ add_custom_target(clang-bolt DEPENDS clang)
+
+ set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING
+ "Name of BOLT-instrumented Clang binary")
+ set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED})
+ set(PERF_TRAINING_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../utils/perf-training)
+ set(BOLT_FDATA ${PERF_TRAINING_BINARY_DIR}/prof.fdata)
+ get_llvm_lit_path(
+ lit_base_dir
+ lit_file_name
+ ALLOW_EXTERNAL
+ )
+ set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")
+
+ # This POST_BUILD command is executed unconditionally even if the clang target
+ # is already built. We need to wrap the whole bolt optimization process in
+ # a single python wrapper, so that we can first check if the binary has
+ # already been optimized and then exit early with a 0 status if it has.
+ add_custom_command(
+ TARGET clang POST_BUILD
+ COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
+ bolt-optimize
+ --method ${CLANG_BOLT}
+ --input $<TARGET_FILE:clang>
+ --instrumented-output ${CLANG_INSTRUMENTED}
+ --fdata ${BOLT_FDATA}
+ --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
+ --readelf $<TARGET_FILE:llvm-readobj>
+ --bolt $<TARGET_FILE:llvm-bolt>
+ --lit "${LIT_COMMAND}"
+ --merge-fdata $<TARGET_FILE:merge-fdata>
+ COMMENT "Optimizing Clang with BOLT"
+ USES_TERMINAL
+ VERBATIM
+ )
+endif()
diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt
index 49673790ff6e84..4aed086563ee92 100644
--- a/clang/utils/perf-training/CMakeLists.txt
+++ b/clang/utils/perf-training/CMakeLists.txt
@@ -83,8 +83,6 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD)
endif()
if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
- set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING
- "Name of BOLT-instrumented Clang binary")
configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in
${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg
@@ -93,7 +91,7 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang"
${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/
EXCLUDE_FROM_CHECK_ALL
- DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data
+ DEPENDS clear-bolt-fdata clear-perf-data
)
add_custom_target(clear-bolt-fdata
@@ -104,26 +102,4 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
COMMENT "Clearing old perf data")
- string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
- if (CLANG_BOLT STREQUAL "LBR")
- set(BOLT_LBR "--lbr")
- endif()
-
- add_custom_target(merge-fdata-deps)
- if (CLANG_BOLT STREQUAL "INSTRUMENT")
- add_dependencies(merge-fdata-deps generate-bolt-fdata)
- else()
- # Convert perf profiles into fdata
- add_custom_target(convert-perf-fdata
- COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $<TARGET_FILE:llvm-bolt> ${CMAKE_CURRENT_BINARY_DIR} $<TARGET_FILE:clang> ${BOLT_LBR}
- COMMENT "Converting perf files to BOLT fdata"
- DEPENDS llvm-bolt generate-bolt-fdata)
- add_dependencies(merge-fdata-deps convert-perf-fdata)
- endif()
-
- # Merge profiles into one using merge-fdata
- add_custom_target(clang-bolt-profile
- COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR}
- COMMENT "Merging BOLT fdata"
- DEPENDS merge-fdata merge-fdata-deps)
endif()
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index d76c6ede3fe5a2..098c5973096122 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -16,6 +16,8 @@
import bisect
import shlex
import tempfile
+import re
+import shutil
test_env = {"PATH": os.environ["PATH"]}
@@ -558,7 +560,74 @@ def genOrderFile(args):
return 0
+def bolt_optimize(args):
+ parser = argparse.ArgumentParser("%prog [options] ")
+ parser.add_argument('--method')
+ parser.add_argument('--input')
+ parser.add_argument('--instrumented-output')
+ parser.add_argument('--fdata')
+ parser.add_argument('--perf-training-binary-dir')
+ parser.add_argument('--readelf')
+ parser.add_argument('--bolt')
+ parser.add_argument('--lit')
+ parser.add_argument('--merge-fdata')
+
+ opts = parser.parse_args(args)
+
+ readelf = opts.readelf
+ bolt = opts.bolt
+ lit = opts.lit
+
+ output = subprocess.check_output(
+ [readelf, "-WS", opts.input], universal_newlines=True
+ )
+
+ # This binary has already been bolt-optimized, so skip further processing.
+ if re.search('\\.bolt\\.org\\.text', output, re.MULTILINE):
+ return 0
+
+ if opts.method == 'INSTRUMENT':
+ process = subprocess.run(
+ [bolt, opts.input, '-o', opts.instrumented_output, '-instrument', '--instrumentation-file-append-pid',
+ f'--instrumentation-file={opts.fdata}'],
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text = True)
+
+ print(process.args)
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
+
+ process = subprocess.run(
+ [sys.executable, lit, os.path.join(opts.perf_training_binary_dir, 'bolt-fdata')],
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text = True)
+
+ print(process.args)
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
+
+ if opts.method == 'PERF':
+ perf2bolt([bolt, opts.perf_training_binary_dir, opts.input])
+
+ merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
+
+ shutil.copy(opts.input, f'{opts.input}-prebolt')
+
+ process = subprocess.run(
+ [bolt, f'{opts.input}-prebolt', '-o', opts.input, '-data', opts.fdata,
+ '-reorder-blocks=ext-tsp', '-reorder-functions=cdsort', '-split-functions',
+ '-split-all-cold', '-split-eh', '-dyno-stats', '-use-gnu-stack', '-update-debug-sections',
+ '-nl' if opts.method == 'PERF' else ''],
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text = True)
+
+ print(process.args)
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
+
+
commands = {
+ "bolt-optimize" : bolt_optimize,
"clean": clean,
"merge": merge,
"dtrace": dtrace,
|
✅ With the latest revision this PR passed the Python code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you, looks good overall. When you land, please monitor clang-bolt builder which exercises this logic.
Co-authored-by: Amir Ayupov <[email protected]>
--instrumented-output ${CLANG_INSTRUMENTED} | ||
--fdata ${BOLT_FDATA} | ||
--perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR} | ||
--readelf $<TARGET_FILE:llvm-readobj> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
--readelf $<TARGET_FILE:llvm-readobj> | |
--readelf $<TARGET_FILE:llvm-readelf> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had to revert this one. llvm-readelf is just a symlink, so there is no llvm-readelf target.
readelf = opts.readelf | ||
bolt = opts.bolt | ||
lit = opts.lit |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just a nit, but these variables are only used a few times so I'd probably just inline them for simplicity.
Co-authored-by: Petr Hosek <[email protected]>
Co-authored-by: Petr Hosek <[email protected]>
This reverts commit 543dffe.
@petrhosek How does the latest version look to you? |
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/143/builds/4933. Here is the relevant piece of the build log for reference:
|
This change removes the need to call the clang-bolt target in order to apply bolt optimizations to clang. Now running `ninja clang` will build a clang with bolt optimizations, and `ninja check-clang` and `ninja install-clang` will test and install bolt optimized clang too.
The clang-bolt target has been kept for compatibility reasons, but it is now just an alias to the clang target.
Also, this new design for applying the bolt optimizations to clang will be easier to generalize and use to optimize other binaries/libraries in the project.