gcc/0207-Use-AI-ability-to-enable-Link-Time-Optimization.patch
2024-09-25 10:00:14 +08:00

617 lines
20 KiB
Diff
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

From 8db421d94ad808c51c86514d7170c97e7704fd6d Mon Sep 17 00:00:00 2001
From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com>
Date: Mon, 23 Sep 2024 19:14:39 +0800
Subject: [PATCH] Use AI ability to enable Link Time Optimization.
---
gcc/collect2.c | 230 ++++++++++++++++++++++++++++++++++-
gcc/config/aarch64/aarch64.c | 24 +++-
gcc/ipa-hardware-detection.c | 145 +++++-----------------
gcc/opts-common.c | 47 ++++---
4 files changed, 316 insertions(+), 130 deletions(-)
diff --git a/gcc/collect2.c b/gcc/collect2.c
index f8a5ce459..d4b6a1849 100644
--- a/gcc/collect2.c
+++ b/gcc/collect2.c
@@ -51,7 +51,7 @@ along with GCC; see the file COPYING3. If not see
#include "obstack.h"
#include "intl.h"
#include "version.h"
-
+
/* On certain systems, we have code that works by scanning the object file
directly. But this code uses system-specific header files and library
functions, so turn it off in a cross-compiler. Likewise, the names of
@@ -207,6 +207,7 @@ static int static_obj; /* true if -static */
static const char *c_file; /* <xxx>.c for constructor/destructor list. */
static const char *o_file; /* <xxx>.o for constructor/destructor list. */
+static const char *ai_optimize_file; /* <xxx>.o for ai optimization file. */
#ifdef COLLECT_EXPORT_LIST
static const char *export_file; /* <xxx>.x for AIX export list. */
#endif
@@ -745,6 +746,131 @@ maybe_run_lto_and_relink (char **lto_ld_argv, char **object_lst,
else
post_ld_pass (false); /* No LTO objects were found, no temp file. */
}
+
+/* Helper functions to determine whether a string ends or starts with a specified string. */
+
+/* Return true if STR ends with SUFFIX.  An empty SUFFIX matches any STR.  */
+
+static bool
+ends_with (const char *str, const char *suffix)
+{
+  const size_t str_len = strlen (str);
+  const size_t suffix_len = strlen (suffix);
+
+  /* A suffix longer than the string itself can never match.  */
+  if (suffix_len > str_len)
+    return false;
+
+  /* The tail of STR starting here is exactly SUFFIX_LEN characters long.  */
+  const char *tail = str + (str_len - suffix_len);
+  return strcmp (tail, suffix) == 0;
+}
+
+/* Return true if STR begins with PREFIX.  An empty PREFIX matches any
+   STR.  */
+
+static bool
+starts_with (const char *str, const char *prefix)
+{
+  const size_t prefix_len = strlen (prefix);
+
+  /* A prefix longer than the string itself can never match.  */
+  if (strlen (str) < prefix_len)
+    return false;
+
+  return strncmp (str, prefix, prefix_len) == 0;
+}
+
+/* Convert the two hexadecimal digits at the start of HEXSTR into one byte
+   stored through BYTE.  Return true on success, false if fewer than two
+   characters are available or either one is not a hexadecimal digit.  */
+
+static bool
+hex_to_byte (const char *hexStr, char *byte)
+{
+  /* Validate the characters in order so that nothing past a terminating
+     NUL is ever read.  */
+  for (int pos = 0; pos < 2; pos++)
+    if (hexStr[pos] == '\0' || !ISXDIGIT (hexStr[pos]))
+      return false;
+
+  /* Exactly two digits are consumed; sscanf reports one converted item.  */
+  return sscanf (hexStr, "%2hhx", byte) == 1;
+}
+
+/* Type of the model entry point that ai_preprocess looks up with dlsym:
+   a function taking a char * and returning an int64_t.  */
+typedef int64_t (*run_ai_model_func)(char *);
+/* Declare an anonymous union that overlays a void * (_q) with a TOTYPE
+   (_nq); ai_preprocess uses it to turn the void * returned by dlsym into
+   a value of type TOTYPE.  */
+#define PTR_UNION_TYPE(TOTYPE) union { void *_q; TOTYPE _nq; }
+/* Access the void * member of the union object NAME.  */
+#define PTR_UNION_AS_VOID_PTR(NAME) (NAME._q)
+/* Access the TOTYPE member of the union object NAME.  */
+#define PTR_UNION_AS_CAST_PTR(NAME) (NAME._nq)
+
+/* Ask the AI model whether link-time optimization should be enabled.
+
+   ARGC and ARGV describe the command line.  The arguments are joined into
+   a single string, each one followed by a space, and that string is passed
+   to the model entry point looked up in libONNXRunner.so.  When the model
+   returns a nonzero prediction, AI_LTO_OPTION=1 is put into the
+   environment.
+
+   Return the model prediction, or -1 if either library or the model entry
+   point cannot be loaded.  */
+
+static int
+ai_preprocess (int argc, char **argv)
+{
+  /* Every argument is followed by one space; one more byte is needed for
+     the terminating NUL.  */
+  size_t total_length = 1;
+  for (int index = 0; index < argc; index++)
+    total_length += strlen (argv[index]) + 1;
+
+  /* xmalloc does not return on allocation failure, so the result needs no
+     NULL check.  */
+  char *ai_input = (char *) xmalloc (total_length);
+
+  /* Append through a moving cursor instead of calling strcat, which would
+     rescan the buffer from the start for every argument.  */
+  char *cursor = ai_input;
+  for (int index = 0; index < argc; index++)
+    {
+      const size_t length = strlen (argv[index]);
+      memcpy (cursor, argv[index], length);
+      cursor += length;
+      *cursor++ = ' ';
+    }
+  *cursor = '\0';
+
+  int result = -1;
+
+  /* Load dependent AI-framework libraries.  The second library is only
+     attempted when the first one has been loaded.  */
+  const char *onnxruntime_lib_path = "libonnxruntime.so";
+  const char *ai4c_lib_path = "libONNXRunner.so";
+  void *ai4c_lib_handle = NULL;
+  void *onnxruntime_lib_handle
+    = dlopen (onnxruntime_lib_path, RTLD_LAZY | RTLD_GLOBAL);
+  if (onnxruntime_lib_handle)
+    ai4c_lib_handle = dlopen (ai4c_lib_path, RTLD_LAZY | RTLD_GLOBAL);
+
+  if (ai4c_lib_handle)
+    {
+      /* Clear any existing error.  */
+      dlerror ();
+
+      /* dlsym returns a void *; the union converts it to the function
+         pointer type.
+         NOTE(review): confirm that "runONNXModelLTo" is spelled exactly
+         as the library exports it; symbol lookup is case-sensitive.  */
+      PTR_UNION_TYPE (run_ai_model_func) run_ai_model_func_union;
+      PTR_UNION_AS_VOID_PTR (run_ai_model_func_union)
+        = dlsym (ai4c_lib_handle, "runONNXModelLTo");
+      run_ai_model_func run_ai_model
+        = PTR_UNION_AS_CAST_PTR (run_ai_model_func_union);
+
+      if (run_ai_model)
+        {
+          /* Run AI4Compiler model on the joined command line.  */
+          int64_t model_pred = (*run_ai_model) (ai_input);
+          if (model_pred)
+            putenv ("AI_LTO_OPTION=1");
+          result = (int) model_pred;
+        }
+    }
+
+  /* Release everything on every path, including the failure paths.  */
+  if (ai4c_lib_handle)
+    dlclose (ai4c_lib_handle);
+  if (onnxruntime_lib_handle)
+    dlclose (onnxruntime_lib_handle);
+  free (ai_input);
+
+  return result;
+}
+
+/* Return the address of the "ai_info" symbol exported by libONNXRunner.so,
+   or NULL if the library cannot be loaded or does not export the symbol.
+
+   On success the library is intentionally left loaded: the returned
+   pointer refers to storage inside the library, so closing the handle
+   here could unmap that storage and leave the caller with a dangling
+   pointer.  */
+
+static char*
+get_ai_info ()
+{
+  const char *ai4c_lib_path = "libONNXRunner.so";
+  void *ai4c_lib_handle = dlopen (ai4c_lib_path, RTLD_LAZY | RTLD_GLOBAL);
+
+  if (!ai4c_lib_handle)
+    return NULL;
+
+  char *ai_info = (char*) dlsym (ai4c_lib_handle, "ai_info");
+  if (!ai_info)
+    {
+      /* Nothing from the library is handed out, so it is safe to drop
+         this reference.  */
+      dlclose (ai4c_lib_handle);
+      return NULL;
+    }
+
+  /* No dlclose on this path; see the function comment.  */
+  return ai_info;
+}
+
/* Entry point for linker invoation. Called from main in collect2.c.
LD_ARGV is an array of arguments for the linker. */
@@ -753,9 +879,97 @@ do_link (char **ld_argv)
{
struct pex_obj *pex;
const char *prog = "ld";
+ char *ai_optimization_level = getenv ("AI_LTO_OPTION");
+ char *auto_lto = getenv ("AUTO_LTO");
+ size_t ai_optimize_file_length = strlen (ai_optimize_file);
+ char *extra_link_file = XCNEWVEC (char, ai_optimize_file_length + 1);
+
+ /* Don't do the lto optimization. */
+ if (!ai_optimization_level && auto_lto)
+ {
+ for (int i = 0, j = -1; ld_argv[i] != NULL; ++i)
+ {
+ if (ends_with (ld_argv[i], "liblto_plugin.so"))
+ {
+ for (j = i + 1; ld_argv[j] != NULL; ++j)
+ {
+ if (!starts_with (ld_argv[j], "-plugin-opt="))
+ break;
+ }
+ for (i = i - 1;; ++i, ++j)
+ {
+ ld_argv[i] = ld_argv[j];
+ if (ld_argv[j] == NULL)
+ break;
+ }
+ break;
+ }
+ }
+ }
+ else if (ai_optimization_level && auto_lto)
+ {
+ char *lto_ai_output = get_ai_info ();
+ const size_t extra_link_file_name_length = strlen(lto_ai_output) / 2;
+ char *ai_output_buffer = XCNEWVEC (char, extra_link_file_name_length);
+ if (!ai_output_buffer)
+ {
+ perror ("Failed to allocate memory");
+ return;
+ }
+
+ for (size_t i = 0; i < extra_link_file_name_length; i++)
+ {
+ const char *hexPart = &lto_ai_output[i * 2];
+ if (!hex_to_byte (hexPart, &ai_output_buffer[i]))
+ {
+ perror ("Error converting hexadecimal");
+ free (ai_output_buffer);
+ return;
+ }
+ }
+
+ int output_fd;
+ output_fd = open (ai_optimize_file, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+ if (output_fd == -1)
+ {
+ perror ("Failed to open output file");
+ free (ai_output_buffer);
+ return;
+ }
+
+ ssize_t bytesWritten = write (output_fd, ai_output_buffer, extra_link_file_name_length);
+ if (bytesWritten != extra_link_file_name_length)
+ {
+ perror ("Failed to write output file");
+ free (ai_output_buffer);
+ close (output_fd);
+ return;
+ }
+
+ free (ai_output_buffer);
+ close (output_fd);
+
+ int last = 0;
+ while (ld_argv[last] != NULL)
+ {
+ last++;
+ }
+
+ ld_argv = XRESIZEVEC (char *, ld_argv, last + 4);
+ if (!extra_link_file)
+ {
+ perror ("Failed to allocate memory.");
+ return ;
+ }
+ strcpy (extra_link_file, ai_optimize_file);
+ ld_argv[last] = extra_link_file;
+ ld_argv[last + 1] = NULL;
+ }
+
pex = collect_execute (prog, ld_argv, NULL, NULL,
PEX_LAST | PEX_SEARCH,
HAVE_GNU_LD && at_file_supplied);
+ free (extra_link_file);
int ret = collect_wait (prog, pex);
if (ret)
{
@@ -949,6 +1163,18 @@ main (int argc, char **argv)
{
bool no_partition = false;
+ /* Only enable the AI ability when auto-LTO is in use;
+ otherwise it may cause errors in normal processing. */
+
+ FILE *file = fopen ("/tmp/ai_flag.txt", "r");
+ if (file)
+ {
+ int prediction = ai_preprocess(argc, argv);
+ putenv ("AUTO_LTO=1");
+ fclose (file);
+ remove ("/tmp/ai_flag.txt");
+ }
+
for (i = 1; argv[i] != NULL; i ++)
{
if (! strcmp (argv[i], "-debug"))
@@ -1184,6 +1410,7 @@ main (int argc, char **argv)
{
c_file = concat (output_file, ".cdtor.c", NULL);
o_file = concat (output_file, ".cdtor.o", NULL);
+ ai_optimize_file = concat (output_file, ".ai_optimize.o", NULL);
#ifdef COLLECT_EXPORT_LIST
export_file = concat (output_file, ".x", NULL);
#endif
@@ -1192,6 +1419,7 @@ main (int argc, char **argv)
{
c_file = make_temp_file (".cdtor.c");
o_file = make_temp_file (".cdtor.o");
+ ai_optimize_file = make_temp_file (".ai_optimize.o");
#ifdef COLLECT_EXPORT_LIST
export_file = make_temp_file (".x");
#endif
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e67e77e6a..83b8ebe8d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14514,14 +14514,26 @@ override_Fortran_optimize_options (struct gcc_options *opts)
opts->x_param_flexible_seg_len = 1;
}
+/* Set the LTO-related fields of OPTS: request fat LTO objects and set the
+   LTO flag to "auto".  */
+
+static void
+override_lto_option (struct gcc_options *opts)
+{
+  /* The two assignments are independent of each other.  */
+  opts->x_flag_fat_lto_objects = 1;
+  opts->x_flag_lto = "auto";
+}
+
/* Reset the optimize option.
After checking the model result, this function can
reset the more appropriate options. */
+
static void
reset_machine_option (struct gcc_options *opts)
{
+ /* When -mcpu=native is parsed, extra information is appended to the CPU
+ string, so its length is then greater than 6. */
if (!(opts->x_optimize_machine)
- || strstr (opts->x_aarch64_tune_string, "hip09") == NULL)
+ || !(strstr (opts->x_aarch64_cpu_string, "hip09") != NULL
+ || strstr (opts->x_aarch64_cpu_string, "tsv110") != NULL)
+ && (strlen (opts->x_aarch64_cpu_string) > 6))
{
return;
}
@@ -14543,6 +14555,16 @@ reset_machine_option (struct gcc_options *opts)
override_Fortran_optimize_options (opts);
}
}
+ else
+ {
+ override_lto_option (opts);
+ FILE *file = fopen ("/tmp/ai_flag.txt", "w");
+ if (file)
+ {
+ fprintf (file, "Do the link time optimization.\n");
+ fclose (file);
+ }
+ }
}
/* Implement targetm.vectorize.add_stmt_cost. */
diff --git a/gcc/ipa-hardware-detection.c b/gcc/ipa-hardware-detection.c
index f127ebe2c..079099783 100644
--- a/gcc/ipa-hardware-detection.c
+++ b/gcc/ipa-hardware-detection.c
@@ -38,115 +38,19 @@ along with GCC; see the file COPYING3.  If not see
#include "print-tree.h"
#include "cfghooks.h"
#include "gimple-fold.h"
+#include "basic-block.h"
namespace {
-static basic_block
-create_abort_bb (basic_block last_bb)
+/* Check whether the name of FN_DECL matches TARGET. */
+bool
+target_func_p (tree fn_decl, const char* target)
{
- basic_block bb = create_empty_bb (last_bb);
- if (last_bb->loop_father != NULL)
- {
- add_bb_to_loop (bb, last_bb->loop_father);
- loops_state_set (LOOPS_NEED_FIXUP);
- }
- gimple_stmt_iterator gsi = gsi_last_bb (bb);
- tree fn = builtin_decl_implicit (BUILT_IN_ABORT);
- gimple *g = gimple_build_call (fn, 0);
- gsi_insert_after (&gsi, g, GSI_NEW_STMT);
- return bb;
-}
-
-static basic_block
-create_part_bb (basic_block last_bb, tree part_base)
-{
- basic_block bb = create_empty_bb (last_bb);
- if (last_bb->loop_father != NULL)
- {
- add_bb_to_loop (bb, last_bb->loop_father);
- loops_state_set (LOOPS_NEED_FIXUP);
- }
- gimple_stmt_iterator gsi = gsi_last_bb (bb);
- gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT);
- /* This number is used to efficiently identify the supported part range. */
- tree part_cond = gimplify_build2 (
- &gsi, PLUS_EXPR, unsigned_type_node, part_base,
- build_int_cst (unsigned_type_node, 4294963967));
- gcond *cond = gimple_build_cond (LE_EXPR, part_cond,
- build_int_cst (unsigned_type_node, 2),
- NULL_TREE, NULL_TREE);
- gimple_set_location (cond, input_location);
- gsi_insert_before (&gsi, cond, GSI_SAME_STMT);
- gsi_remove (&gsi, true);
- return bb;
+  /* FN_NAME and TARGET are both pointers, so sizeof on them yields the
+     size of a pointer rather than the length of the string.  Compare the
+     complete NUL-terminated names instead, so that only an exact match
+     is accepted.  */
+  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn_decl));
+  return fn_name && strcmp (fn_name, target) == 0;
}
-static void
-create_detection_bb ()
-{
- edge old_e = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
- basic_block ret_bb = old_e->dest;
-
- basic_block detection_bb = create_empty_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun));
- if (ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father != NULL)
- {
- add_bb_to_loop (detection_bb, ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father);
- loops_state_set (LOOPS_NEED_FIXUP);
- }
- tree cpuid_decl = build_decl (input_location, VAR_DECL,
- get_identifier ("cpuid"), unsigned_type_node);
- add_local_decl (cfun, cpuid_decl);
-
- gimple_stmt_iterator gsi = gsi_last_bb (detection_bb);
- vec<tree, va_gc> *outputs = NULL;
- tree purpose = build_string (strlen ("=r"), "=r");
- tree output = build_tree_list (
- build_tree_list (NULL_TREE, purpose), cpuid_decl);
- vec_safe_push (outputs, output);
- gasm *asm_stmt = gimple_build_asm_vec (
- "mrs %0, MIDR_EL1", NULL, outputs, NULL, NULL);
- gsi_insert_after (&gsi, asm_stmt, GSI_NEW_STMT);
- gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT);
-
- tree implementer = gimplify_build2 (
- &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl,
- build_int_cst (unsigned_type_node, 24));
- tree part_base = gimplify_build2 (
- &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl,
- build_int_cst (unsigned_type_node, 4));
- tree part = gimplify_build2 (
- &gsi, BIT_AND_EXPR, unsigned_type_node, part_base,
- build_int_cst (unsigned_type_node, 4095));
- gcond *implementer_cond = gimple_build_cond (
- EQ_EXPR, implementer,
- build_int_cst (unsigned_type_node, 72),
- NULL_TREE, NULL_TREE);
- gimple_set_location (implementer_cond, input_location);
- gsi_insert_before (&gsi, implementer_cond, GSI_SAME_STMT);
- gsi_remove (&gsi, true);
-
- basic_block part_bb = create_part_bb (detection_bb, part);
- basic_block abort_bb = create_abort_bb (part_bb);
-
- remove_edge_raw (old_e);
- make_single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun),
- detection_bb, EDGE_FALLTHRU);
- edge etrue = make_edge (detection_bb, part_bb, EDGE_TRUE_VALUE);
- etrue->probability = profile_probability::likely ();
- edge efalse = make_edge (detection_bb, abort_bb, EDGE_FALSE_VALUE);
- efalse->probability = profile_probability::unlikely ();
- edge part_true = make_edge (part_bb, ret_bb, EDGE_TRUE_VALUE);
- part_true->probability = profile_probability::likely ();
- edge part_false = make_edge (part_bb, abort_bb, EDGE_FALSE_VALUE);
- part_false->probability = profile_probability::unlikely ();
- make_single_succ_edge (abort_bb, ret_bb, EDGE_FALLTHRU);
- if (dom_info_available_p (CDI_DOMINATORS))
- {
- set_immediate_dominator (CDI_DOMINATORS, part_bb, detection_bb);
- set_immediate_dominator (CDI_DOMINATORS, ret_bb, detection_bb);
- set_immediate_dominator (CDI_DOMINATORS, abort_bb, detection_bb);
- }
-}
const pass_data pass_data_ipa_hardware_detection =
{
@@ -176,10 +80,8 @@ bool
pass_ipa_hardware_detection::gate (function *)
{
const char *ai_infer_level = getenv ("AI_INFER_LEVEL");
- return (ai_infer_level
- && optimize_machine > 0
- /* Only enable in lto or whole_program.  */
- && (in_lto_p || flag_whole_program));
+ const char *ai_lto_option = getenv ("AI_LTO_OPTION");
+ return ((ai_lto_option || (ai_infer_level && optimize_machine > 0)) && (in_lto_p || flag_whole_program));
}
unsigned int
@@ -187,6 +89,25 @@ pass_ipa_hardware_detection::execute (function *)
{
unsigned int ret = 0;
cgraph_node *cnode;
+ gcall* call_stmt = NULL;
+ tree fntype_void_void = build_function_type_array (void_type_node, 0, NULL);
+ tree fndecl_decl = build_fn_decl ("get_ai_info", fntype_void_void);
+
+ DECL_EXTERNAL (fndecl_decl) = 1;
+ TREE_PUBLIC (fndecl_decl) = 1;
+ DECL_CONTEXT (fndecl_decl) = NULL;
+ struct cgraph_node *node = cgraph_node::create (fndecl_decl);
+
+ FOR_EACH_FUNCTION (cnode)
+ {
+ const char *func_name = IDENTIFIER_POINTER (DECL_NAME (cnode->decl));
+ if (target_func_p (cnode->decl, "get_ai_info"))
+ {
+ call_stmt = gimple_build_call (cnode->decl, 0);
+ break;
+ }
+ }
+
FOR_EACH_FUNCTION (cnode)
{
if (!cnode->real_symbol_p ())
@@ -207,12 +128,10 @@ pass_ipa_hardware_detection::execute (function *)
&& MAIN_NAME_P (DECL_NAME (cnode->decl)))
{
push_cfun (fn);
- calculate_dominance_info (CDI_DOMINATORS);
-
- create_detection_bb ();
-
- cgraph_edge::rebuild_edges ();
- free_dominance_info (CDI_DOMINATORS);
+ basic_block first_block = single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+ gimple_stmt_iterator gsi = gsi_start_bb (first_block);
+ if (call_stmt)
+ gsi_insert_before (&gsi, call_stmt, GSI_NEW_STMT);
pop_cfun ();
}
}
diff --git a/gcc/opts-common.c b/gcc/opts-common.c
index 52e28e2dc..c6c32a366 100644
--- a/gcc/opts-common.c
+++ b/gcc/opts-common.c
@@ -1009,12 +1009,12 @@ handle_lto_option (unsigned int lang_mask,
if (strstr (lan, "gcc") != NULL)
{
opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 2);
- const char* lto_flag = "-flto=8";
+ const char* lto_flag = "-flto=auto";
decode_cmdline_option (&lto_flag, lang_mask,
&opt_array[num_decoded_options]);
ret++;
- const char* ltopartition_flag = "-flto-partition=one";
- decode_cmdline_option (&ltopartition_flag, lang_mask,
+ const char* fat_lto_objects_flag = "-ffat-lto-objects";
+ decode_cmdline_option (&fat_lto_objects_flag, lang_mask,
&opt_array[num_decoded_options + 1]);
ret++;
}
@@ -1022,7 +1022,7 @@ handle_lto_option (unsigned int lang_mask,
|| strstr (lan, "gfortran") != NULL)
{
opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 1);
- const char* lto_flag = "-flto=8";
+ const char* lto_flag = "-flto=auto";
decode_cmdline_option (&lto_flag, lang_mask,
&opt_array[num_decoded_options]);
ret++;
@@ -1040,25 +1040,42 @@ handle_machine_option (unsigned int lang_mask,
struct cl_decoded_option *&opt_array)
{
int ret = 0;
- bool flag_Om = false;
bool flag_hip09 = false;
for (unsigned i = 1; i < argc; i ++)
{
- if (strcmp (argv[i], "-Om") == 0)
- flag_Om = true;
- if (strstr (argv[i], "mcpu=hip09") != NULL)
- flag_hip09 = true;
+ if (strstr(argv[i], "mcpu=native") != NULL)
+ {
+ FILE *f = fopen("/proc/cpuinfo", "r");
+ if (f == NULL)
+ {
+ perror("Failed to open /proc/cpuinfo");
+ return -1;
+ }
+
+ char buf[256];
+
+ while (fgets(buf, sizeof(buf), f) != NULL)
+ {
+ buf[strcspn(buf, "\n")] = 0;
+ if (strstr(buf, "CPU implementer") != NULL)
+ {
+ if (strstr(buf, "0x48") != NULL)
+ {
+ flag_hip09 = true;
+ break;
+ }
+ }
+ }
+ fclose(f);
+ }
}
- if (!flag_hip09 || !flag_Om)
- {
+ if (!flag_hip09)
return ret;
- }
const char *ai_infer_level = getenv ("AI_INFER_LEVEL");
if (ai_infer_level)
- {
- return ret;
- }
+ return ret;
+
int argc_hw = 6;
int64_t argv_hw[argc_hw] = {
global_options.x_param_simultaneous_prefetches,
--
2.33.0