gcc/0207-Use-AI-ability-to-enable-Link-Time-Optimization.patch

617 lines
20 KiB
Diff
Raw Normal View History

From 8db421d94ad808c51c86514d7170c97e7704fd6d Mon Sep 17 00:00:00 2001
From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com>
Date: Mon, 23 Sep 2024 19:14:39 +0800
Subject: [PATCH] Use AI ability to enable Link Time Optimization.
---
gcc/collect2.c | 230 ++++++++++++++++++++++++++++++++++-
gcc/config/aarch64/aarch64.c | 24 +++-
gcc/ipa-hardware-detection.c | 145 +++++-----------------
gcc/opts-common.c | 47 ++++---
4 files changed, 316 insertions(+), 130 deletions(-)
diff --git a/gcc/collect2.c b/gcc/collect2.c
index f8a5ce459..d4b6a1849 100644
--- a/gcc/collect2.c
+++ b/gcc/collect2.c
@@ -51,7 +51,7 @@ along with GCC; see the file COPYING3. If not see
#include "obstack.h"
#include "intl.h"
#include "version.h"
-
+
/* On certain systems, we have code that works by scanning the object file
directly. But this code uses system-specific header files and library
functions, so turn it off in a cross-compiler. Likewise, the names of
@@ -207,6 +207,7 @@ static int static_obj; /* true if -static */
static const char *c_file; /* <xxx>.c for constructor/destructor list. */
static const char *o_file; /* <xxx>.o for constructor/destructor list. */
+static const char *ai_optimize_file; /* <xxx>.o for ai optimization file. */
#ifdef COLLECT_EXPORT_LIST
static const char *export_file; /* <xxx>.x for AIX export list. */
#endif
@@ -745,6 +746,131 @@ maybe_run_lto_and_relink (char **lto_ld_argv, char **object_lst,
else
post_ld_pass (false); /* No LTO objects were found, no temp file. */
}
+
+/* Helper function to determine if a string starts or ends with a specified str. */
+
+static bool
+ends_with(const char *str, const char *suffix)
+{
+  const size_t str_len = strlen(str), suffix_len = strlen(suffix);
+  if (suffix_len > str_len) return false; /* SUFFIX cannot fit in STR. */
+  return strcmp(str + (str_len - suffix_len), suffix) == 0;
+}
+
+static bool
+starts_with(const char *str, const char *prefix)
+{
+  const size_t prefix_len = strlen(prefix);
+  if (strlen(str) < prefix_len) return false; /* STR is too short. */
+  return !strncmp(str, prefix, prefix_len);
+}
+
+static bool
+hex_to_byte(const char *hexStr, char *byte)
+{
+  /* NUL is not a hex digit, so a string shorter than two digits fails here. */
+  if (!ISXDIGIT(hexStr[0]) || !ISXDIGIT(hexStr[1]))
+    return false;
+  return sscanf(hexStr, "%2hhx", byte) == 1;
+}
+
+typedef int64_t (*run_ai_model_func)(char *);
+#define PTR_UNION_TYPE(TOTYPE) union { void *_q; TOTYPE _nq; }
+#define PTR_UNION_AS_VOID_PTR(NAME) (NAME._q)
+#define PTR_UNION_AS_CAST_PTR(NAME) (NAME._nq)
+
+/* Feed the collect2 command line to the AI model in libONNXRunner.so.
+   Return the model's prediction (a non-zero one also sets AI_LTO_OPTION=1
+   in the environment), or -1 if a library or the entry point is missing.  */
+static int
+ai_preprocess (int argc, char **argv)
+{
+  /* Every argument is followed by one space; the initial 1 reserves the
+     terminating NUL, which the buffer previously had no room for.  */
+  size_t total_length = 1;
+  for (int index = 0; index < argc; index++)
+    total_length += strlen (argv[index]) + 1;
+
+  /* xmalloc aborts on allocation failure, so no NULL check is needed.  */
+  char *ai_input = (char *) xmalloc (total_length);
+  ai_input[0] = '\0';
+
+  /* Join the arguments into one space-separated string.  */
+  for (int index = 0; index < argc; index++)
+    {
+      strcat (ai_input, argv[index]);
+      strcat (ai_input, " ");
+    }
+
+  /* Stays -1 unless the model actually runs.  */
+  int result = -1;
+
+  /* Load dependent AI-framework libraries.  */
+  void *onnxruntime_lib_handle
+    = dlopen ("libonnxruntime.so", RTLD_LAZY | RTLD_GLOBAL);
+  if (onnxruntime_lib_handle)
+    {
+      void *ai4c_lib_handle
+        = dlopen ("libONNXRunner.so", RTLD_LAZY | RTLD_GLOBAL);
+      if (ai4c_lib_handle)
+        {
+          /* Clear any existing error.  */
+          dlerror ();
+
+          /* The union turns dlsym's void * into a function pointer
+             without a direct object-to-function pointer cast.  */
+          PTR_UNION_TYPE (run_ai_model_func) run_ai_model_func_union;
+          /* NOTE(review): the symbol is spelled "LTo" here; confirm it
+             matches the name exported by libONNXRunner.so.  */
+          PTR_UNION_AS_VOID_PTR (run_ai_model_func_union)
+            = dlsym (ai4c_lib_handle, "runONNXModelLTo");
+          run_ai_model_func run_ai_model
+            = PTR_UNION_AS_CAST_PTR (run_ai_model_func_union);
+
+          if (run_ai_model)
+            {
+              /* Run AI4Compiler model.  */
+              int64_t model_pred = (*run_ai_model) (ai_input);
+
+              /* do_link reads AI_LTO_OPTION from the environment.  */
+              if (model_pred)
+                putenv ("AI_LTO_OPTION=1");
+
+              result = model_pred;
+            }
+
+          dlclose (ai4c_lib_handle);
+        }
+
+      dlclose (onnxruntime_lib_handle);
+    }
+
+  /* The joined string is only needed for the model call above.  */
+  free (ai_input);
+  return result;
+}
+
+/* Return the address of the "ai_info" symbol in libONNXRunner.so, or NULL.  */
+static char*
+get_ai_info ()
+{
+  void *ai4c_lib_handle = dlopen ("libONNXRunner.so", RTLD_LAZY | RTLD_GLOBAL);
+  if (!ai4c_lib_handle)
+    return NULL;
+
+  char *ai_info = (char *) dlsym (ai4c_lib_handle, "ai_info");
+  if (!ai_info)
+    {
+      dlclose (ai4c_lib_handle);
+      return NULL;
+    }
+
+  /* AI_INFO is an address inside the loaded library, so the handle is
+     deliberately left open: closing it here could unmap the data before
+     the caller has read it.  */
+  return ai_info;
+}
+
/* Entry point for linker invoation. Called from main in collect2.c.
LD_ARGV is an array of arguments for the linker. */
@@ -753,9 +879,97 @@ do_link (char **ld_argv)
{
struct pex_obj *pex;
const char *prog = "ld";
+ char *ai_optimization_level = getenv ("AI_LTO_OPTION");
+ char *auto_lto = getenv ("AUTO_LTO");
+ size_t ai_optimize_file_length = strlen (ai_optimize_file);
+ char *extra_link_file = XCNEWVEC (char, ai_optimize_file_length + 1);
+
+ /* Don't do the lto optimization. */
+ if (!ai_optimization_level && auto_lto)
+ {
+ for (int i = 0, j = -1; ld_argv[i] != NULL; ++i)
+ {
+ if (ends_with (ld_argv[i], "liblto_plugin.so"))
+ {
+ for (j = i + 1; ld_argv[j] != NULL; ++j)
+ {
+ if (!starts_with (ld_argv[j], "-plugin-opt="))
+ break;
+ }
+ for (i = i - 1;; ++i, ++j)
+ {
+ ld_argv[i] = ld_argv[j];
+ if (ld_argv[j] == NULL)
+ break;
+ }
+ break;
+ }
+ }
+ }
+ else if (ai_optimization_level && auto_lto)
+   {
+     /* get_ai_info returns NULL when the library or the symbol cannot be
+        loaded; handing that to strlen would crash.  */
+     char *lto_ai_output = get_ai_info ();
+     if (!lto_ai_output)
+       {
+         fprintf (stderr, "Failed to load AI optimization info\n");
+         free (extra_link_file);
+         return;
+       }
+
+     /* Every two hex digits decode to one byte of the output file.
+        XCNEWVEC aborts on failure, so the buffer needs no NULL check.  */
+     const size_t extra_link_file_name_length = strlen (lto_ai_output) / 2;
+     char *ai_output_buffer = XCNEWVEC (char, extra_link_file_name_length);
+     for (size_t i = 0; i < extra_link_file_name_length; i++)
+       {
+         const char *hexPart = &lto_ai_output[i * 2];
+         if (!hex_to_byte (hexPart, &ai_output_buffer[i]))
+           {
+             perror ("Error converting hexadecimal");
+             free (ai_output_buffer);
+             free (extra_link_file);
+             return;
+           }
+       }
+
+     int output_fd = open (ai_optimize_file, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+     if (output_fd == -1)
+       {
+         perror ("Failed to open output file");
+         free (ai_output_buffer);
+         free (extra_link_file);
+         return;
+       }
+
+     ssize_t bytesWritten = write (output_fd, ai_output_buffer, extra_link_file_name_length);
+     if (bytesWritten < 0 || (size_t) bytesWritten != extra_link_file_name_length)
+       {
+         perror ("Failed to write output file");
+         free (ai_output_buffer);
+         free (extra_link_file);
+         close (output_fd);
+         return;
+       }
+
+     free (ai_output_buffer);
+     close (output_fd);
+     int last = 0;
+     while (ld_argv[last] != NULL)
+       last++;
+     /* NOTE(review): XRESIZEVEC may move the vector and the caller's
+        pointer is not updated; confirm no caller reuses it afterwards.  */
+     ld_argv = XRESIZEVEC (char *, ld_argv, last + 4);
+     strcpy (extra_link_file, ai_optimize_file);
+     ld_argv[last] = extra_link_file;
+     ld_argv[last + 1] = NULL;
+   }
+
pex = collect_execute (prog, ld_argv, NULL, NULL,
PEX_LAST | PEX_SEARCH,
HAVE_GNU_LD && at_file_supplied);
+ free (extra_link_file);
int ret = collect_wait (prog, pex);
if (ret)
{
@@ -949,6 +1163,18 @@ main (int argc, char **argv)
{
bool no_partition = false;
+ /* Only enable AI ability when using auto_LTO.
+ Otherwise it may cause errors in the normal process. */
+
+ FILE *file = fopen ("/tmp/ai_flag.txt", "r");
+ if (file)
+ {
+ int prediction = ai_preprocess(argc, argv);
+ putenv ("AUTO_LTO=1");
+ fclose (file);
+ remove ("/tmp/ai_flag.txt");
+ }
+
for (i = 1; argv[i] != NULL; i ++)
{
if (! strcmp (argv[i], "-debug"))
@@ -1184,6 +1410,7 @@ main (int argc, char **argv)
{
c_file = concat (output_file, ".cdtor.c", NULL);
o_file = concat (output_file, ".cdtor.o", NULL);
+ ai_optimize_file = concat (output_file, ".ai_optimize.o", NULL);
#ifdef COLLECT_EXPORT_LIST
export_file = concat (output_file, ".x", NULL);
#endif
@@ -1192,6 +1419,7 @@ main (int argc, char **argv)
{
c_file = make_temp_file (".cdtor.c");
o_file = make_temp_file (".cdtor.o");
+ ai_optimize_file = make_temp_file (".ai_optimize.o");
#ifdef COLLECT_EXPORT_LIST
export_file = make_temp_file (".x");
#endif
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e67e77e6a..83b8ebe8d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14514,14 +14514,26 @@ override_Fortran_optimize_options (struct gcc_options *opts)
opts->x_param_flexible_seg_len = 1;
}
+static void
+override_lto_option (struct gcc_options *opts)
+{
+ opts->x_flag_lto = "auto"; /* Presumably mirrors -flto=auto; confirm.  */
+ opts->x_flag_fat_lto_objects = 1; /* Presumably mirrors -ffat-lto-objects.  */
+}
+
/* Reset the optimize option.
After checking the model result, this function can
reset the more appropriate options. */
+
static void
reset_machine_option (struct gcc_options *opts)
{
+ /* Parsing mcpu=native leaves extra info after the CPU name, so the
+ string length is then greater than 6. */
if (!(opts->x_optimize_machine)
- || strstr (opts->x_aarch64_tune_string, "hip09") == NULL)
+ || !(strstr (opts->x_aarch64_cpu_string, "hip09") != NULL
+ || strstr (opts->x_aarch64_cpu_string, "tsv110") != NULL)
+ && (strlen (opts->x_aarch64_cpu_string) > 6))
{
return;
}
@@ -14543,6 +14555,16 @@ reset_machine_option (struct gcc_options *opts)
override_Fortran_optimize_options (opts);
}
}
+ else
+ {
+ override_lto_option (opts);
+ FILE *file = fopen ("/tmp/ai_flag.txt", "w");
+ if (file)
+ {
+ fprintf (file, "Do the link time optimization.\n");
+ fclose (file);
+ }
+ }
}
/* Implement targetm.vectorize.add_stmt_cost. */
diff --git a/gcc/ipa-hardware-detection.c b/gcc/ipa-hardware-detection.c
index f127ebe2c..079099783 100644
--- a/gcc/ipa-hardware-detection.c
+++ b/gcc/ipa-hardware-detection.c
@@ -38,115 +38,19 @@ along with GCC; see the file COPYING3.  If not see
#include "print-tree.h"
#include "cfghooks.h"
#include "gimple-fold.h"
+#include "basic-block.h"
namespace {
-static basic_block
-create_abort_bb (basic_block last_bb)
+/* Return true if the name of FN_DECL matches TARGET. */
+bool
+target_func_p (tree fn_decl, const char* target)
{
- basic_block bb = create_empty_bb (last_bb);
- if (last_bb->loop_father != NULL)
- {
- add_bb_to_loop (bb, last_bb->loop_father);
- loops_state_set (LOOPS_NEED_FIXUP);
- }
- gimple_stmt_iterator gsi = gsi_last_bb (bb);
- tree fn = builtin_decl_implicit (BUILT_IN_ABORT);
- gimple *g = gimple_build_call (fn, 0);
- gsi_insert_after (&gsi, g, GSI_NEW_STMT);
- return bb;
-}
-
-static basic_block
-create_part_bb (basic_block last_bb, tree part_base)
-{
- basic_block bb = create_empty_bb (last_bb);
- if (last_bb->loop_father != NULL)
- {
- add_bb_to_loop (bb, last_bb->loop_father);
- loops_state_set (LOOPS_NEED_FIXUP);
- }
- gimple_stmt_iterator gsi = gsi_last_bb (bb);
- gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT);
- /* This number is used to efficiently identify the supported part range. */
- tree part_cond = gimplify_build2 (
- &gsi, PLUS_EXPR, unsigned_type_node, part_base,
- build_int_cst (unsigned_type_node, 4294963967));
- gcond *cond = gimple_build_cond (LE_EXPR, part_cond,
- build_int_cst (unsigned_type_node, 2),
- NULL_TREE, NULL_TREE);
- gimple_set_location (cond, input_location);
- gsi_insert_before (&gsi, cond, GSI_SAME_STMT);
- gsi_remove (&gsi, true);
- return bb;
+ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn_decl));
+ /* Compare whole names: sizeof on these pointers measured the pointer only. */
+ return fn_name && strcmp (fn_name, target) == 0;
}
-static void
-create_detection_bb ()
-{
- edge old_e = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
- basic_block ret_bb = old_e->dest;
-
- basic_block detection_bb = create_empty_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun));
- if (ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father != NULL)
- {
- add_bb_to_loop (detection_bb, ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father);
- loops_state_set (LOOPS_NEED_FIXUP);
- }
- tree cpuid_decl = build_decl (input_location, VAR_DECL,
- get_identifier ("cpuid"), unsigned_type_node);
- add_local_decl (cfun, cpuid_decl);
-
- gimple_stmt_iterator gsi = gsi_last_bb (detection_bb);
- vec<tree, va_gc> *outputs = NULL;
- tree purpose = build_string (strlen ("=r"), "=r");
- tree output = build_tree_list (
- build_tree_list (NULL_TREE, purpose), cpuid_decl);
- vec_safe_push (outputs, output);
- gasm *asm_stmt = gimple_build_asm_vec (
- "mrs %0, MIDR_EL1", NULL, outputs, NULL, NULL);
- gsi_insert_after (&gsi, asm_stmt, GSI_NEW_STMT);
- gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT);
-
- tree implementer = gimplify_build2 (
- &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl,
- build_int_cst (unsigned_type_node, 24));
- tree part_base = gimplify_build2 (
- &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl,
- build_int_cst (unsigned_type_node, 4));
- tree part = gimplify_build2 (
- &gsi, BIT_AND_EXPR, unsigned_type_node, part_base,
- build_int_cst (unsigned_type_node, 4095));
- gcond *implementer_cond = gimple_build_cond (
- EQ_EXPR, implementer,
- build_int_cst (unsigned_type_node, 72),
- NULL_TREE, NULL_TREE);
- gimple_set_location (implementer_cond, input_location);
- gsi_insert_before (&gsi, implementer_cond, GSI_SAME_STMT);
- gsi_remove (&gsi, true);
-
- basic_block part_bb = create_part_bb (detection_bb, part);
- basic_block abort_bb = create_abort_bb (part_bb);
-
- remove_edge_raw (old_e);
- make_single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun),
- detection_bb, EDGE_FALLTHRU);
- edge etrue = make_edge (detection_bb, part_bb, EDGE_TRUE_VALUE);
- etrue->probability = profile_probability::likely ();
- edge efalse = make_edge (detection_bb, abort_bb, EDGE_FALSE_VALUE);
- efalse->probability = profile_probability::unlikely ();
- edge part_true = make_edge (part_bb, ret_bb, EDGE_TRUE_VALUE);
- part_true->probability = profile_probability::likely ();
- edge part_false = make_edge (part_bb, abort_bb, EDGE_FALSE_VALUE);
- part_false->probability = profile_probability::unlikely ();
- make_single_succ_edge (abort_bb, ret_bb, EDGE_FALLTHRU);
- if (dom_info_available_p (CDI_DOMINATORS))
- {
- set_immediate_dominator (CDI_DOMINATORS, part_bb, detection_bb);
- set_immediate_dominator (CDI_DOMINATORS, ret_bb, detection_bb);
- set_immediate_dominator (CDI_DOMINATORS, abort_bb, detection_bb);
- }
-}
const pass_data pass_data_ipa_hardware_detection =
{
@@ -176,10 +80,8 @@ bool
pass_ipa_hardware_detection::gate (function *)
{
const char *ai_infer_level = getenv ("AI_INFER_LEVEL");
- return (ai_infer_level
- && optimize_machine > 0
- /* Only enable in lto or whole_program.  */
- && (in_lto_p || flag_whole_program));
+ const char *ai_lto_option = getenv ("AI_LTO_OPTION");
+ return ((ai_lto_option || (ai_infer_level && optimize_machine > 0)) && (in_lto_p || flag_whole_program));
}
unsigned int
@@ -187,6 +89,25 @@ pass_ipa_hardware_detection::execute (function *)
{
unsigned int ret = 0;
cgraph_node *cnode;
+ gcall* call_stmt = NULL;
+ tree fntype_void_void = build_function_type_array (void_type_node, 0, NULL);
+ tree fndecl_decl = build_fn_decl ("get_ai_info", fntype_void_void);
+
+ DECL_EXTERNAL (fndecl_decl) = 1;
+ TREE_PUBLIC (fndecl_decl) = 1;
+ DECL_CONTEXT (fndecl_decl) = NULL;
+ struct cgraph_node *node = cgraph_node::create (fndecl_decl);
+
+ FOR_EACH_FUNCTION (cnode)
+ {
+ const char *func_name = IDENTIFIER_POINTER (DECL_NAME (cnode->decl));
+ if (target_func_p (cnode->decl, "get_ai_info"))
+ {
+ call_stmt = gimple_build_call (cnode->decl, 0);
+ break;
+ }
+ }
+
FOR_EACH_FUNCTION (cnode)
{
if (!cnode->real_symbol_p ())
@@ -207,12 +128,10 @@ pass_ipa_hardware_detection::execute (function *)
&& MAIN_NAME_P (DECL_NAME (cnode->decl)))
{
push_cfun (fn);
- calculate_dominance_info (CDI_DOMINATORS);
-
- create_detection_bb ();
-
- cgraph_edge::rebuild_edges ();
- free_dominance_info (CDI_DOMINATORS);
+ basic_block first_block = single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+ gimple_stmt_iterator gsi = gsi_start_bb (first_block);
+ if (call_stmt)
+ gsi_insert_before (&gsi, call_stmt, GSI_NEW_STMT);
pop_cfun ();
}
}
diff --git a/gcc/opts-common.c b/gcc/opts-common.c
index 52e28e2dc..c6c32a366 100644
--- a/gcc/opts-common.c
+++ b/gcc/opts-common.c
@@ -1009,12 +1009,12 @@ handle_lto_option (unsigned int lang_mask,
if (strstr (lan, "gcc") != NULL)
{
opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 2);
- const char* lto_flag = "-flto=8";
+ const char* lto_flag = "-flto=auto";
decode_cmdline_option (&lto_flag, lang_mask,
&opt_array[num_decoded_options]);
ret++;
- const char* ltopartition_flag = "-flto-partition=one";
- decode_cmdline_option (&ltopartition_flag, lang_mask,
+ const char* fat_lto_objects_flag = "-ffat-lto-objects";
+ decode_cmdline_option (&fat_lto_objects_flag, lang_mask,
&opt_array[num_decoded_options + 1]);
ret++;
}
@@ -1022,7 +1022,7 @@ handle_lto_option (unsigned int lang_mask,
|| strstr (lan, "gfortran") != NULL)
{
opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 1);
- const char* lto_flag = "-flto=8";
+ const char* lto_flag = "-flto=auto";
decode_cmdline_option (&lto_flag, lang_mask,
&opt_array[num_decoded_options]);
ret++;
@@ -1040,25 +1040,42 @@ handle_machine_option (unsigned int lang_mask,
struct cl_decoded_option *&opt_array)
{
int ret = 0;
- bool flag_Om = false;
bool flag_hip09 = false;
for (unsigned i = 1; i < argc; i ++)
{
- if (strcmp (argv[i], "-Om") == 0)
- flag_Om = true;
- if (strstr (argv[i], "mcpu=hip09") != NULL)
- flag_hip09 = true;
+ if (strstr(argv[i], "mcpu=native") != NULL)
+ {
+ FILE *f = fopen("/proc/cpuinfo", "r");
+ if (f == NULL)
+ {
+ perror("Failed to open /proc/cpuinfo");
+ return -1;
+ }
+
+ char buf[256];
+
+ while (fgets(buf, sizeof(buf), f) != NULL)
+ {
+ buf[strcspn(buf, "\n")] = 0;
+ if (strstr(buf, "CPU implementer") != NULL)
+ {
+ if (strstr(buf, "0x48") != NULL)
+ {
+ flag_hip09 = true;
+ break;
+ }
+ }
+ }
+ fclose(f);
+ }
}
- if (!flag_hip09 || !flag_Om)
- {
+ if (!flag_hip09)
return ret;
- }
const char *ai_infer_level = getenv ("AI_INFER_LEVEL");
if (ai_infer_level)
- {
- return ret;
- }
+ return ret;
+
int argc_hw = 6;
int64_t argv_hw[argc_hw] = {
global_options.x_param_simultaneous_prefetches,
--
2.33.0