From 8db421d94ad808c51c86514d7170c97e7704fd6d Mon Sep 17 00:00:00 2001
From: zhenyu--zhao_admin
Date: Mon, 23 Sep 2024 19:14:39 +0800
Subject: [PATCH] Use AI ability to enable Link Time Optimization.

---
 gcc/collect2.c               | 241 ++++++++++++++++++++++++++++++++++-
 gcc/config/aarch64/aarch64.c |  27 +++-
 gcc/ipa-hardware-detection.c | 146 +++++-----------------
 gcc/opts-common.c            |  47 ++++---
 4 files changed, 331 insertions(+), 130 deletions(-)

diff --git a/gcc/collect2.c b/gcc/collect2.c
index f8a5ce459..d4b6a1849 100644
--- a/gcc/collect2.c
+++ b/gcc/collect2.c
@@ -51,7 +51,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "obstack.h"
 #include "intl.h"
 #include "version.h"
-
+
 /* On certain systems, we have code that works by scanning the object file
    directly.  But this code uses system-specific header files and library
    functions, so turn it off in a cross-compiler.  Likewise, the names of
@@ -207,6 +207,7 @@ static int static_obj;			/* true if -static */
 
 static const char *c_file;		/* <xxx>.c for constructor/destructor list.  */
 static const char *o_file;		/* <xxx>.o for constructor/destructor list.  */
+static const char *ai_optimize_file;	/* <xxx>.ai_optimize.o for the AI-generated object.  */
 #ifdef COLLECT_EXPORT_LIST
 static const char *export_file;		/* <xxx>.x for AIX export list.  */
 #endif
@@ -745,6 +746,134 @@ maybe_run_lto_and_relink (char **lto_ld_argv, char **object_lst,
   else
     post_ld_pass (false); /* No LTO objects were found, no temp file.  */
 }
+
+/* Helper functions to determine whether a string ends or starts with
+   a specified string.  */
+
+static bool
+ends_with (const char *str, const char *suffix)
+{
+  size_t lensuffix = strlen (suffix);
+  size_t lenstr = strlen (str);
+  return lenstr >= lensuffix && strcmp (str + lenstr - lensuffix, suffix) == 0;
+}
+
+static bool
+starts_with (const char *str, const char *prefix)
+{
+  size_t lenprefix = strlen (prefix);
+  size_t lenstr = strlen (str);
+  return lenstr >= lenprefix && strncmp (str, prefix, lenprefix) == 0;
+}
+
+/* Decode the two hexadecimal digits at HEXSTR into *BYTE.  Return false
+   if HEXSTR does not begin with two hexadecimal digits.  */
+
+static bool
+hex_to_byte (const char *hexStr, char *byte)
+{
+  if (hexStr[0] == '\0' || hexStr[1] == '\0')
+    return false;
+  if (!ISXDIGIT (hexStr[0]) || !ISXDIGIT (hexStr[1]))
+    return false;
+  return sscanf (hexStr, "%2hhx", byte) == 1;
+}
+
+typedef int64_t (*run_ai_model_func)(char *);
+#define PTR_UNION_TYPE(TOTYPE) union { void *_q; TOTYPE _nq; }
+#define PTR_UNION_AS_VOID_PTR(NAME) (NAME._q)
+#define PTR_UNION_AS_CAST_PTR(NAME) (NAME._nq)
+
+/* Join ARGV into one space-separated string and pass it to the
+   "runONNXModelLTo" entry point of libONNXRunner.so.  Return the model's
+   prediction, setting AI_LTO_OPTION=1 in the environment when it is
+   non-zero, or -1 if a library or the entry point cannot be loaded.  */
+
+static int
+ai_preprocess (int argc, char **argv)
+{
+  size_t total_length = 0;
+  for (int index = 0; index < argc; index++)
+    total_length += strlen (argv[index]) + 1;
+
+  /* Each argument is followed by a space; reserve one more byte for the
+     terminating NUL.  xmalloc aborts on failure, so no NULL check.  */
+  char *ai_input = (char *) xmalloc (total_length + 1);
+  ai_input[0] = '\0';
+
+  for (int index = 0; index < argc; index++)
+    {
+      strcat (ai_input, argv[index]);
+      strcat (ai_input, " ");
+    }
+
+  /* Load dependent AI-framework libraries.  */
+  void *onnxruntime_lib_handle
+    = dlopen ("libonnxruntime.so", RTLD_LAZY | RTLD_GLOBAL);
+  if (!onnxruntime_lib_handle)
+    {
+      free (ai_input);
+      return -1;
+    }
+
+  void *ai4c_lib_handle = dlopen ("libONNXRunner.so", RTLD_LAZY | RTLD_GLOBAL);
+  if (!ai4c_lib_handle)
+    {
+      dlclose (onnxruntime_lib_handle);
+      free (ai_input);
+      return -1;
+    }
+
+  /* Clear any existing error.  */
+  dlerror ();
+
+  /* Look up the AI4Compiler model entry point.  */
+  run_ai_model_func run_ai_model;
+  PTR_UNION_TYPE (run_ai_model_func) run_ai_model_func_union;
+  PTR_UNION_AS_VOID_PTR (run_ai_model_func_union)
+    = dlsym (ai4c_lib_handle, "runONNXModelLTo");
+  run_ai_model = PTR_UNION_AS_CAST_PTR (run_ai_model_func_union);
+
+  if (!run_ai_model)
+    {
+      dlclose (ai4c_lib_handle);
+      dlclose (onnxruntime_lib_handle);
+      free (ai_input);
+      return -1;
+    }
+
+  /* Run the model on the joined command line.  */
+  int64_t model_pred = (*run_ai_model) (ai_input);
+
+  dlclose (ai4c_lib_handle);
+  dlclose (onnxruntime_lib_handle);
+  free (ai_input);
+
+  if (model_pred)
+    putenv ("AI_LTO_OPTION=1");
+
+  return model_pred;
+}
+
+/* Return the address of the "ai_info" symbol exported by
+   libONNXRunner.so, or NULL if the library or the symbol is missing.
+   On success the library is deliberately left loaded: the returned
+   pointer refers to storage inside it and would dangle after dlclose.  */
+
+static char *
+get_ai_info ()
+{
+  void *ai4c_lib_handle = dlopen ("libONNXRunner.so", RTLD_LAZY | RTLD_GLOBAL);
+  if (!ai4c_lib_handle)
+    return NULL;
+
+  char *ai_info = (char *) dlsym (ai4c_lib_handle, "ai_info");
+  if (!ai_info)
+    dlclose (ai4c_lib_handle);
+
+  return ai_info;
+}
+
 
 /* Entry point for linker invoation.  Called from main in collect2.c.
    LD_ARGV is an array of arguments for the linker.  */
@@ -753,9 +882,104 @@ do_link (char **ld_argv)
 {
   struct pex_obj *pex;
   const char *prog = "ld";
+  char *ai_optimization_level = getenv ("AI_LTO_OPTION");
+  char *auto_lto = getenv ("AUTO_LTO");
+  /* Private copy of LD_ARGV extended with EXTRA_LINK_FILE, if needed.  */
+  char **ai_ld_argv = NULL;
+  char *extra_link_file = NULL;
+
+  if (!ai_optimization_level && auto_lto)
+    {
+      /* Don't do the lto optimization: remove the argument preceding the
+	 liblto_plugin.so path, the path itself and the "-plugin-opt="
+	 arguments that follow it.  */
+      for (int i = 0, j = -1; ld_argv[i] != NULL; ++i)
+	{
+	  if (i > 0 && ends_with (ld_argv[i], "liblto_plugin.so"))
+	    {
+	      for (j = i + 1; ld_argv[j] != NULL; ++j)
+		{
+		  if (!starts_with (ld_argv[j], "-plugin-opt="))
+		    break;
+		}
+	      for (i = i - 1;; ++i, ++j)
+		{
+		  ld_argv[i] = ld_argv[j];
+		  if (ld_argv[j] == NULL)
+		    break;
+		}
+	      break;
+	    }
+	}
+    }
+  else if (ai_optimization_level && auto_lto)
+    {
+      /* The model library exports the extra object as a hex string.  */
+      char *lto_ai_output = get_ai_info ();
+      if (!lto_ai_output)
+	{
+	  fprintf (stderr, "Failed to load AI optimization info\n");
+	  return;
+	}
+
+      const size_t extra_link_file_name_length = strlen (lto_ai_output) / 2;
+      char *ai_output_buffer = XCNEWVEC (char, extra_link_file_name_length);
+
+      for (size_t i = 0; i < extra_link_file_name_length; i++)
+	{
+	  const char *hexPart = &lto_ai_output[i * 2];
+	  if (!hex_to_byte (hexPart, &ai_output_buffer[i]))
+	    {
+	      perror ("Error converting hexadecimal");
+	      free (ai_output_buffer);
+	      return;
+	    }
+	}
+
+      int output_fd;
+      output_fd = open (ai_optimize_file, O_WRONLY | O_CREAT | O_TRUNC,
+			S_IRUSR | S_IWUSR);
+      if (output_fd == -1)
+	{
+	  perror ("Failed to open output file");
+	  free (ai_output_buffer);
+	  return;
+	}
+
+      ssize_t bytesWritten = write (output_fd, ai_output_buffer,
+				    extra_link_file_name_length);
+      if (bytesWritten < 0
+	  || (size_t) bytesWritten != extra_link_file_name_length)
+	{
+	  perror ("Failed to write output file");
+	  free (ai_output_buffer);
+	  close (output_fd);
+	  return;
+	}
+
+      free (ai_output_buffer);
+      close (output_fd);
+
+      int last = 0;
+      while (ld_argv[last] != NULL)
+	last++;
+
+      /* Append the object to a private copy of the argument vector:
+	 resizing LD_ARGV in place could leave the caller holding a
+	 stale pointer.  */
+      ai_ld_argv = XNEWVEC (char *, last + 2);
+      memcpy (ai_ld_argv, ld_argv, last * sizeof (char *));
+      extra_link_file = xstrdup (ai_optimize_file);
+      ai_ld_argv[last] = extra_link_file;
+      ai_ld_argv[last + 1] = NULL;
+      ld_argv = ai_ld_argv;
+    }
+
   pex = collect_execute (prog, ld_argv, NULL, NULL,
 			 PEX_LAST | PEX_SEARCH,
 			 HAVE_GNU_LD && at_file_supplied);
+  free (extra_link_file);
+  free (ai_ld_argv);
   int ret = collect_wait (prog, pex);
   if (ret)
     {
@@ -949,6 +1173,19 @@ main (int argc, char **argv)
   {
     bool no_partition = false;
 
+    /* Only enable the AI ability when auto-LTO was requested, which the
+       compiler proper signals by creating /tmp/ai_flag.txt.  Otherwise
+       it may cause errors in the normal link process.  */
+
+    FILE *file = fopen ("/tmp/ai_flag.txt", "r");
+    if (file)
+      {
+	ai_preprocess (argc, argv);
+	putenv ("AUTO_LTO=1");
+	fclose (file);
+	remove ("/tmp/ai_flag.txt");
+      }
+
     for (i = 1; argv[i] != NULL; i ++)
       {
 	if (! strcmp (argv[i], "-debug"))
@@ -1184,6 +1421,7 @@ main (int argc, char **argv)
     {
       c_file = concat (output_file, ".cdtor.c", NULL);
       o_file = concat (output_file, ".cdtor.o", NULL);
+      ai_optimize_file = concat (output_file, ".ai_optimize.o", NULL);
 #ifdef COLLECT_EXPORT_LIST
       export_file = concat (output_file, ".x", NULL);
 #endif
@@ -1192,6 +1430,7 @@ main (int argc, char **argv)
     {
       c_file = make_temp_file (".cdtor.c");
       o_file = make_temp_file (".cdtor.o");
+      ai_optimize_file = make_temp_file (".ai_optimize.o");
 #ifdef COLLECT_EXPORT_LIST
       export_file = make_temp_file (".x");
 #endif
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e67e77e6a..83b8ebe8d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14514,14 +14514,28 @@ override_Fortran_optimize_options (struct gcc_options *opts)
   opts->x_param_flexible_seg_len = 1;
 }
 
+/* Force -flto=auto with fat LTO objects.  */
+
+static void
+override_lto_option (struct gcc_options *opts)
+{
+  opts->x_flag_lto = "auto";
+  opts->x_flag_fat_lto_objects = 1;
+}
+
 /* Reset the optimize option.
    After checking the model result, this function can
    reset the more appropriate options.  */
+
 static void
 reset_machine_option (struct gcc_options *opts)
 {
+  /* Parsing mcpu=native will have extra info after it, so the length
+     would be greater than 6.  */
   if (!(opts->x_optimize_machine)
-      || strstr (opts->x_aarch64_tune_string, "hip09") == NULL)
+      || (!(strstr (opts->x_aarch64_cpu_string, "hip09") != NULL
+	    || strstr (opts->x_aarch64_cpu_string, "tsv110") != NULL)
+	  && (strlen (opts->x_aarch64_cpu_string) > 6)))
     {
       return;
     }
@@ -14543,6 +14557,17 @@ reset_machine_option (struct gcc_options *opts)
 	  override_Fortran_optimize_options (opts);
 	}
     }
+  else
+    {
+      override_lto_option (opts);
+      /* Tell collect2 (which probes for this file) to run the AI model.  */
+      FILE *file = fopen ("/tmp/ai_flag.txt", "w");
+      if (file)
+	{
+	  fprintf (file, "Do the link time optimization.\n");
+	  fclose (file);
+	}
+    }
 }
 
 /* Implement targetm.vectorize.add_stmt_cost.  */
diff --git a/gcc/ipa-hardware-detection.c b/gcc/ipa-hardware-detection.c
index f127ebe2c..079099783 100644
--- a/gcc/ipa-hardware-detection.c
+++ b/gcc/ipa-hardware-detection.c
@@ -38,115 +38,19 @@ along with GCC; see the file COPYING3.  If not see
 #include "print-tree.h"
 #include "cfghooks.h"
 #include "gimple-fold.h"
+#include "basic-block.h"
 
 namespace {
 
-static basic_block
-create_abort_bb (basic_block last_bb)
+/* Return true if FN_DECL is named exactly TARGET.  */
+bool
+target_func_p (tree fn_decl, const char* target)
 {
-  basic_block bb = create_empty_bb (last_bb);
-  if (last_bb->loop_father != NULL)
-    {
-      add_bb_to_loop (bb, last_bb->loop_father);
-      loops_state_set (LOOPS_NEED_FIXUP);
-    }
-  gimple_stmt_iterator gsi = gsi_last_bb (bb);
-  tree fn = builtin_decl_implicit (BUILT_IN_ABORT);
-  gimple *g = gimple_build_call (fn, 0);
-  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
-  return bb;
-}
-
-static basic_block
-create_part_bb (basic_block last_bb, tree part_base)
-{
-  basic_block bb = create_empty_bb (last_bb);
-  if (last_bb->loop_father != NULL)
-    {
-      add_bb_to_loop (bb, last_bb->loop_father);
-      loops_state_set (LOOPS_NEED_FIXUP);
-    }
-  gimple_stmt_iterator gsi = gsi_last_bb (bb);
-  gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT);
-  /* This number is used to efficiently identify the supported part range.  */
-  tree part_cond = gimplify_build2 (
-    &gsi, PLUS_EXPR, unsigned_type_node, part_base,
-    build_int_cst (unsigned_type_node, 4294963967));
-  gcond *cond = gimple_build_cond (LE_EXPR, part_cond,
-				   build_int_cst (unsigned_type_node, 2),
-				   NULL_TREE, NULL_TREE);
-  gimple_set_location (cond, input_location);
-  gsi_insert_before (&gsi, cond, GSI_SAME_STMT);
-  gsi_remove (&gsi, true);
-  return bb;
+  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn_decl));
+  return (fn_name && target
+	  && strcmp (fn_name, target) == 0);
 }
 
-static void
-create_detection_bb ()
-{
-  edge old_e = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
-  basic_block ret_bb = old_e->dest;
-
-  basic_block detection_bb = create_empty_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun));
-  if (ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father != NULL)
-    {
-      add_bb_to_loop (detection_bb, ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father);
-      loops_state_set (LOOPS_NEED_FIXUP);
-    }
-  tree cpuid_decl = build_decl (input_location, VAR_DECL,
-				get_identifier ("cpuid"), unsigned_type_node);
-  add_local_decl (cfun, cpuid_decl);
-
-  gimple_stmt_iterator gsi = gsi_last_bb (detection_bb);
-  vec<tree, va_gc> *outputs = NULL;
-  tree purpose = build_string (strlen ("=r"), "=r");
-  tree output = build_tree_list (
-    build_tree_list (NULL_TREE, purpose), cpuid_decl);
-  vec_safe_push (outputs, output);
-  gasm *asm_stmt = gimple_build_asm_vec (
-    "mrs %0, MIDR_EL1", NULL, outputs, NULL, NULL);
-  gsi_insert_after (&gsi, asm_stmt, GSI_NEW_STMT);
-  gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT);
-
-  tree implementer = gimplify_build2 (
-    &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl,
-    build_int_cst (unsigned_type_node, 24));
-  tree part_base = gimplify_build2 (
-    &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl,
-    build_int_cst (unsigned_type_node, 4));
-  tree part = gimplify_build2 (
-    &gsi, BIT_AND_EXPR, unsigned_type_node, part_base,
-    build_int_cst (unsigned_type_node, 4095));
-  gcond *implementer_cond = gimple_build_cond (
-    EQ_EXPR, implementer,
-    build_int_cst (unsigned_type_node, 72),
-    NULL_TREE, NULL_TREE);
-  gimple_set_location (implementer_cond, input_location);
-  gsi_insert_before (&gsi, implementer_cond, GSI_SAME_STMT);
-  gsi_remove (&gsi, true);
-
-  basic_block part_bb = create_part_bb (detection_bb, part);
-  basic_block abort_bb = create_abort_bb (part_bb);
-
-  remove_edge_raw (old_e);
-  make_single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun),
-			 detection_bb, EDGE_FALLTHRU);
-  edge etrue = make_edge (detection_bb, part_bb, EDGE_TRUE_VALUE);
-  etrue->probability = profile_probability::likely ();
-  edge efalse = make_edge (detection_bb, abort_bb, EDGE_FALSE_VALUE);
-  efalse->probability = profile_probability::unlikely ();
-  edge part_true = make_edge (part_bb, ret_bb, EDGE_TRUE_VALUE);
-  part_true->probability = profile_probability::likely ();
-  edge part_false = make_edge (part_bb, abort_bb, EDGE_FALSE_VALUE);
-  part_false->probability = profile_probability::unlikely ();
-  make_single_succ_edge (abort_bb, ret_bb, EDGE_FALLTHRU);
-  if (dom_info_available_p (CDI_DOMINATORS))
-    {
-      set_immediate_dominator (CDI_DOMINATORS, part_bb, detection_bb);
-      set_immediate_dominator (CDI_DOMINATORS, ret_bb, detection_bb);
-      set_immediate_dominator (CDI_DOMINATORS, abort_bb, detection_bb);
-    }
-}
 
 const pass_data pass_data_ipa_hardware_detection =
 {
@@ -176,10 +80,8 @@ bool
 pass_ipa_hardware_detection::gate (function *)
 {
   const char *ai_infer_level = getenv ("AI_INFER_LEVEL");
-  return (ai_infer_level
-	  && optimize_machine > 0
-	  /* Only enable in lto or whole_program.  */
-	  && (in_lto_p || flag_whole_program));
+  const char *ai_lto_option = getenv ("AI_LTO_OPTION");
+  return ((ai_lto_option || (ai_infer_level && optimize_machine > 0)) && (in_lto_p || flag_whole_program));
 }
 
 unsigned int
@@ -187,6 +89,26 @@ pass_ipa_hardware_detection::execute (function *)
 {
   unsigned int ret = 0;
   cgraph_node *cnode;
+  gcall *call_stmt = NULL;
+  tree fntype_void_void = build_function_type_array (void_type_node, 0, NULL);
+  tree fndecl_decl = build_fn_decl ("get_ai_info", fntype_void_void);
+
+  /* Declare an external get_ai_info and register it in the call graph.  */
+  DECL_EXTERNAL (fndecl_decl) = 1;
+  TREE_PUBLIC (fndecl_decl) = 1;
+  DECL_CONTEXT (fndecl_decl) = NULL;
+  cgraph_node::create (fndecl_decl);
+
+  /* Build a call to the first function named get_ai_info.  */
+  FOR_EACH_FUNCTION (cnode)
+    {
+      if (target_func_p (cnode->decl, "get_ai_info"))
+	{
+	  call_stmt = gimple_build_call (cnode->decl, 0);
+	  break;
+	}
+    }
+
   FOR_EACH_FUNCTION (cnode)
     {
       if (!cnode->real_symbol_p ())
@@ -207,12 +129,10 @@ pass_ipa_hardware_detection::execute (function *)
 	  && MAIN_NAME_P (DECL_NAME (cnode->decl)))
 	{
 	  push_cfun (fn);
-	  calculate_dominance_info (CDI_DOMINATORS);
-
-	  create_detection_bb ();
-
-	  cgraph_edge::rebuild_edges ();
-	  free_dominance_info (CDI_DOMINATORS);
+	  basic_block first_block = single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+	  gimple_stmt_iterator gsi = gsi_start_bb (first_block);
+	  if (call_stmt)
+	    gsi_insert_before (&gsi, call_stmt, GSI_NEW_STMT);
 	  pop_cfun ();
 	}
     }
diff --git a/gcc/opts-common.c b/gcc/opts-common.c
index 52e28e2dc..c6c32a366 100644
--- a/gcc/opts-common.c
+++ b/gcc/opts-common.c
@@ -1009,12 +1009,12 @@ handle_lto_option (unsigned int lang_mask,
   if (strstr (lan, "gcc") != NULL)
     {
       opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 2);
-      const char* lto_flag = "-flto=8";
+      const char* lto_flag = "-flto=auto";
       decode_cmdline_option (&lto_flag, lang_mask,
 			     &opt_array[num_decoded_options]);
       ret++;
-      const char* ltopartition_flag = "-flto-partition=one";
-      decode_cmdline_option (&ltopartition_flag, lang_mask,
+      const char* fat_lto_objects_flag = "-ffat-lto-objects";
+      decode_cmdline_option (&fat_lto_objects_flag, lang_mask,
 			     &opt_array[num_decoded_options + 1]);
       ret++;
     }
@@ -1022,7 +1022,7 @@ handle_lto_option (unsigned int lang_mask,
 	   || strstr (lan, "gfortran") != NULL)
     {
       opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 1);
-      const char* lto_flag = "-flto=8";
+      const char* lto_flag = "-flto=auto";
       decode_cmdline_option (&lto_flag, lang_mask,
 			     &opt_array[num_decoded_options]);
       ret++;
@@ -1040,25 +1040,42 @@ handle_machine_option (unsigned int lang_mask,
 		       struct cl_decoded_option *&opt_array)
 {
   int ret = 0;
-  bool flag_Om = false;
   bool flag_hip09 = false;
   for (unsigned i = 1; i < argc; i ++)
     {
-      if (strcmp (argv[i], "-Om") == 0)
-	flag_Om = true;
-      if (strstr (argv[i], "mcpu=hip09") != NULL)
-	flag_hip09 = true;
+      if (strstr (argv[i], "mcpu=native") != NULL)
+	{
+	  FILE *f = fopen ("/proc/cpuinfo", "r");
+	  if (f == NULL)
+	    {
+	      perror ("Failed to open /proc/cpuinfo");
+	      return ret;
+	    }
+
+	  char buf[256];
+
+	  while (fgets (buf, sizeof (buf), f) != NULL)
+	    {
+	      buf[strcspn (buf, "\n")] = 0;
+	      if (strstr (buf, "CPU implementer") != NULL)
+		{
+		  if (strstr (buf, "0x48") != NULL)
+		    {
+		      flag_hip09 = true;
+		      break;
+		    }
+		}
+	    }
+	  fclose (f);
+	}
     }
-  if (!flag_hip09 || !flag_Om)
-    {
+  if (!flag_hip09)
       return ret;
-    }
 
   const char *ai_infer_level = getenv ("AI_INFER_LEVEL");
   if (ai_infer_level)
-    {
-      return ret;
-    }
+    return ret;
+
   int argc_hw = 6;
   int64_t argv_hw[argc_hw] = {
     global_options.x_param_simultaneous_prefetches,
-- 
2.33.0