diff --git a/0006-AArch64-Add-CFG-block-count-correction-optimization.patch b/0006-AArch64-Add-CFG-block-count-correction-optimization.patch new file mode 100644 index 0000000..b90b76d --- /dev/null +++ b/0006-AArch64-Add-CFG-block-count-correction-optimization.patch @@ -0,0 +1,1820 @@ +From 25c9e9c7d4532f6e8962a25c5c7087bf3e3b8445 Mon Sep 17 00:00:00 2001 +From: rfwang07 +Date: Thu, 25 Jul 2024 14:45:53 +0800 +Subject: [PATCH] Add CFG block count correction optimization. + +--- + bolt/include/bolt/Core/BinaryBasicBlock.h | 59 +- + .../bolt/Core/BinaryBasicBlockFeature.h | 268 ++++++++ + bolt/include/bolt/Passes/FeatureMiner.h | 176 ++++++ + bolt/include/bolt/Passes/StaticBranchInfo.h | 108 ++++ + bolt/include/bolt/Profile/DataReader.h | 93 ++- + bolt/lib/Core/BinaryBasicBlockFeature.cpp | 21 + + bolt/lib/Core/CMakeLists.txt | 1 + + bolt/lib/Passes/CMakeLists.txt | 2 + + bolt/lib/Passes/FeatureMiner.cpp | 572 ++++++++++++++++++ + bolt/lib/Passes/StaticBranchInfo.cpp | 143 +++++ + bolt/lib/Profile/DataReader.cpp | 120 +++- + bolt/lib/Rewrite/RewriteInstance.cpp | 6 + + 12 files changed, 1557 insertions(+), 12 deletions(-) + create mode 100644 bolt/include/bolt/Core/BinaryBasicBlockFeature.h + create mode 100644 bolt/include/bolt/Passes/FeatureMiner.h + create mode 100644 bolt/include/bolt/Passes/StaticBranchInfo.h + create mode 100644 bolt/lib/Core/BinaryBasicBlockFeature.cpp + create mode 100644 bolt/lib/Passes/FeatureMiner.cpp + create mode 100644 bolt/lib/Passes/StaticBranchInfo.cpp + +diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h +index 02be9c1d4..a39d38d6b 100644 +--- a/bolt/include/bolt/Core/BinaryBasicBlock.h ++++ b/bolt/include/bolt/Core/BinaryBasicBlock.h +@@ -15,6 +15,7 @@ + #ifndef BOLT_CORE_BINARY_BASIC_BLOCK_H + #define BOLT_CORE_BINARY_BASIC_BLOCK_H + ++#include "bolt/Core/BinaryBasicBlockFeature.h" + #include "bolt/Core/FunctionLayout.h" + #include "bolt/Core/MCPlus.h" + #include 
"llvm/ADT/GraphTraits.h" +@@ -25,6 +26,7 @@ + #include "llvm/Support/raw_ostream.h" + #include + #include ++#include + + namespace llvm { + class MCCodeEmitter; +@@ -147,6 +149,12 @@ private: + /// Last computed hash value. + mutable uint64_t Hash{0}; + ++ std::set ChildrenSet; ++ ++ std::set ParentSet; ++ ++ BinaryBasicBlockFeature BlockFeatures; ++ + private: + BinaryBasicBlock() = delete; + BinaryBasicBlock(const BinaryBasicBlock &) = delete; +@@ -385,11 +393,14 @@ public: + /// If the basic block ends with a conditional branch (possibly followed by + /// an unconditional branch) and thus has 2 successors, return a successor + /// corresponding to a jump condition which could be true or false. +- /// Return nullptr if the basic block does not have a conditional jump. ++ /// Return the only successor if it's followed by an unconditional branch. ++ /// Return nullptr otherwise. + BinaryBasicBlock *getConditionalSuccessor(bool Condition) { +- if (succ_size() != 2) +- return nullptr; +- return Successors[Condition == true ? 0 : 1]; ++ if (succ_size() == 2) ++ return Successors[Condition == true ? 0 : 1]; ++ if (succ_size() == 1) ++ return Successors[0]; ++ return nullptr; + } + + const BinaryBasicBlock *getConditionalSuccessor(bool Condition) const { +@@ -410,6 +421,13 @@ public: + return const_cast(this)->getFallthrough(); + } + ++ /// Return branch info corresponding to only branch. ++ const BinaryBranchInfo &getOnlyBranchInfo() const { ++ assert(BranchInfo.size() > 0 && ++ "could only be called for blocks with at least 1 successor"); ++ return BranchInfo[0]; ++ }; ++ + /// Return branch info corresponding to a taken branch. + const BinaryBranchInfo &getTakenBranchInfo() const { + assert(BranchInfo.size() == 2 && +@@ -818,6 +836,36 @@ public: + OutputAddressRange.second = Address; + } + ++ /// Sets features of this BB. ++ void setFeatures(BinaryBasicBlockFeature BBF) { ++ BlockFeatures = BBF; ++ } ++ ++ /// Gets numberic features of this BB. 
++ BinaryBasicBlockFeature getFeatures() { ++ return BlockFeatures; ++ } ++ ++ /// Returns a copy of the set of child blocks recorded for this BB. ++ std::set getChildrenSet() { ++ return ChildrenSet; ++ } ++ ++ /// Returns a copy of the set of parent blocks recorded for this BB. ++ std::set getParentSet() { ++ return ParentSet; ++ } ++ ++ /// Records Node in the children set of this BB. ++ void insertChildrenSet(BinaryBasicBlock *Node) { ++ ChildrenSet.insert(Node); ++ } ++ ++ /// Records Node in the parent set of this BB. ++ void insertParentSet(BinaryBasicBlock *Node) { ++ ParentSet.insert(Node); ++ } ++ + /// Gets the memory address range of this BB in the input binary. + std::pair getInputAddressRange() const { + return InputRange; +@@ -991,7 +1039,8 @@ private: + #if defined(LLVM_ON_UNIX) + /// Keep the size of the BinaryBasicBlock within a reasonable size class + /// (jemalloc bucket) on Linux +-static_assert(sizeof(BinaryBasicBlock) <= 256); ++/// The size threshold is expanded from 256 to 2048 to contain the extra BB features ++static_assert(sizeof(BinaryBasicBlock) <= 2048, ""); + #endif + + bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS); +diff --git a/bolt/include/bolt/Core/BinaryBasicBlockFeature.h b/bolt/include/bolt/Core/BinaryBasicBlockFeature.h +new file mode 100644 +index 000000000..2b4809b1a +--- /dev/null ++++ b/bolt/include/bolt/Core/BinaryBasicBlockFeature.h +@@ -0,0 +1,268 @@ ++//===- bolt/Core/BinaryBasicBlockFeature.h - Low-level basic block -----*- C++ ++//-*-===// ++// ++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. ++// See https://llvm.org/LICENSE.txt for license information.
++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ++// ++//===----------------------------------------------------------------------===// ++// ++// Features of BinaryBasicBlock ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef BOLT_CORE_BINARY_BASIC_BLOCK_FEATURE_H ++#define BOLT_CORE_BINARY_BASIC_BLOCK_FEATURE_H ++ ++#include "bolt/Core/FunctionLayout.h" ++#include "bolt/Core/MCPlus.h" ++#include "llvm/ADT/GraphTraits.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/MC/MCInst.h" ++#include "llvm/MC/MCSymbol.h" ++#include "llvm/Support/ErrorOr.h" ++#include "llvm/Support/raw_ostream.h" ++#include ++#include ++ ++namespace llvm { ++ ++namespace bolt { ++ ++class BinaryBasicBlockFeature { ++ ++public: ++ int32_t Opcode; ++ ++ int16_t Direction; ++ ++ int32_t CmpOpcode; ++ ++ int16_t LoopHeader; ++ ++ int16_t ProcedureType; ++ ++ int64_t Count; ++ ++ int64_t FallthroughCount; ++ ++ int64_t TotalLoops; ++ ++ int64_t LoopDepth; ++ ++ int64_t LoopNumBlocks; ++ ++ int64_t LocalExitingBlock; ++ ++ int64_t LocalLatchBlock; ++ ++ int64_t LocalLoopHeader; ++ ++ int64_t Call; ++ ++ int64_t DeltaTaken; ++ ++ int64_t NumLoads; ++ ++ int64_t NumCalls; ++ ++ int64_t OperandRAType; ++ ++ int64_t OperandRBType; ++ ++ int64_t BasicBlockSize; ++ ++ int64_t NumBasicBlocks; ++ ++ int64_t HasIndirectCalls; ++ ++ std::vector EndOpcode_vec; ++ ++ std::vector LoopHeader_vec; ++ ++ std::vector Backedge_vec; ++ ++ std::vector Exit_vec; ++ ++ std::vector Call_vec; ++ ++ std::vector BasicBlockSize_vec; ++ ++ std::vector InferenceFeatures; ++ ++ uint64_t FuncExec; ++ ++ int32_t ParentChildNum; ++ ++ int32_t ParentCount; ++ ++ int32_t ChildParentNum; ++ ++ int32_t ChildCount; ++ ++public: ++ void setOpcode(const int32_t &BlockOpcode) { Opcode = BlockOpcode; } ++ ++ void setDirection(const int16_t &BlockDirection) { ++ Direction = BlockDirection; ++ } ++ ++ void setCmpOpcode(const int32_t &BlockCmpOpcode) { ++ CmpOpcode = 
BlockCmpOpcode; ++ } ++ ++ void setLoopHeader(const int16_t &BlockLoopHeader) { ++ LoopHeader = BlockLoopHeader; ++ } ++ ++ void setProcedureType(const int16_t &BlockProcedureType) { ++ ProcedureType = BlockProcedureType; ++ } ++ ++ void setCount(const int64_t &BlockCount) { Count = BlockCount; } ++ ++ void setFallthroughCount(const int64_t &BlockFallthroughCount) { ++ FallthroughCount = BlockFallthroughCount; ++ } ++ ++ void setTotalLoops(const int64_t &BlockTotalLoops) { ++ TotalLoops = BlockTotalLoops; ++ } ++ ++ void setLoopDepth(const int64_t &BlockLoopDepth) { ++ LoopDepth = BlockLoopDepth; ++ } ++ ++ void setLoopNumBlocks(const int64_t &BlockLoopNumBlocks) { ++ LoopNumBlocks = BlockLoopNumBlocks; ++ } ++ ++ void setLocalExitingBlock(const int64_t &BlockLocalExitingBlock) { ++ LocalExitingBlock = BlockLocalExitingBlock; ++ } ++ ++ void setLocalLatchBlock(const int64_t &BlockLocalLatchBlock) { ++ LocalLatchBlock = BlockLocalLatchBlock; ++ } ++ ++ void setLocalLoopHeader(const int64_t &BlockLocalLoopHeader) { ++ LocalLoopHeader = BlockLocalLoopHeader; ++ } ++ ++ void setDeltaTaken(const int64_t &BlockDeltaTaken) { ++ DeltaTaken = BlockDeltaTaken; ++ } ++ ++ void setNumLoads(const int64_t &BlockNumLoads) { NumLoads = BlockNumLoads; } ++ ++ void setNumCalls(const int64_t &BlockNumCalls) { NumCalls = BlockNumCalls; } ++ ++ void setOperandRAType(const int64_t &BlockOperandRAType) { ++ OperandRAType = BlockOperandRAType; ++ } ++ ++ void setOperandRBType(const int64_t &BlockOperandRBType) { ++ OperandRBType = BlockOperandRBType; ++ } ++ ++ void setBasicBlockSize(const int64_t &BlockBasicBlockSize) { ++ BasicBlockSize = BlockBasicBlockSize; ++ } ++ ++ void setNumBasicBlocks(const int64_t &BlockNumBasicBlocks) { ++ NumBasicBlocks = BlockNumBasicBlocks; ++ } ++ ++ void setHasIndirectCalls(const int64_t &BlockHasIndirectCalls) { ++ HasIndirectCalls = BlockHasIndirectCalls; ++ } ++ ++ void setEndOpcodeVec(const int32_t &EndOpcode) { ++ EndOpcode_vec.push_back(EndOpcode); 
++ } ++ ++ void setLoopHeaderVec(const int16_t &LoopHeader) { ++ LoopHeader_vec.push_back(LoopHeader); ++ } ++ ++ void setBackedgeVec(const int16_t &Backedge) { ++ Backedge_vec.push_back(Backedge); ++ } ++ ++ void setExitVec(const int16_t &Exit) { Exit_vec.push_back(Exit); } ++ ++ void setCallVec(const int16_t &Call) { Call_vec.push_back(Call); } ++ ++ void setBasicBlockSizeVec(const int64_t &BasicBlockSize) { ++ BasicBlockSize_vec.push_back(BasicBlockSize); ++ } ++ ++ void setFunExec(const uint64_t &BlockFuncExec) { FuncExec = BlockFuncExec; } ++ ++ void setParentChildNum(const int32_t &BlockParentChildNum) { ++ ParentChildNum = BlockParentChildNum; ++ } ++ ++ void setParentCount(const int32_t &BlockParentCount) { ++ ParentCount = BlockParentCount; ++ } ++ ++ void setChildParentNum(const int32_t &BlockChildParentNum) { ++ ChildParentNum = BlockChildParentNum; ++ } ++ ++ void setChildCount(const int32_t &BlockChildCount) { ++ ChildCount = BlockChildCount; ++ } ++ ++ void setInferenceFeatures() { ++ ++ if (Count == -1 || FallthroughCount == -1) { ++ return; ++ } ++ if (ParentChildNum == -1 && ParentCount == -1 && ChildParentNum == -1 && ++ ChildCount == -1) { ++ return; ++ } ++ ++ InferenceFeatures.push_back(static_cast(Direction)); ++ InferenceFeatures.push_back(static_cast(LoopHeader)); ++ InferenceFeatures.push_back(static_cast(ProcedureType)); ++ InferenceFeatures.push_back(static_cast(OperandRAType)); ++ InferenceFeatures.push_back(static_cast(OperandRBType)); ++ InferenceFeatures.push_back(static_cast(LoopHeader_vec[0])); ++ InferenceFeatures.push_back(static_cast(Backedge_vec[0])); ++ InferenceFeatures.push_back(static_cast(Exit_vec[0])); ++ InferenceFeatures.push_back(static_cast(LoopHeader_vec[1])); ++ InferenceFeatures.push_back(static_cast(Call_vec[0])); ++ InferenceFeatures.push_back(static_cast(LocalExitingBlock)); ++ InferenceFeatures.push_back(static_cast(HasIndirectCalls)); ++ InferenceFeatures.push_back(static_cast(LocalLatchBlock)); ++ 
InferenceFeatures.push_back(static_cast(LocalLoopHeader)); ++ InferenceFeatures.push_back(static_cast(Opcode)); ++ InferenceFeatures.push_back(static_cast(CmpOpcode)); ++ InferenceFeatures.push_back(static_cast(EndOpcode_vec[0])); ++ InferenceFeatures.push_back(static_cast(EndOpcode_vec[1])); ++ InferenceFeatures.push_back(static_cast(FuncExec)); ++ InferenceFeatures.push_back(static_cast(NumBasicBlocks)); ++ InferenceFeatures.push_back(static_cast(BasicBlockSize)); ++ InferenceFeatures.push_back(static_cast(BasicBlockSize_vec[0])); ++ InferenceFeatures.push_back(static_cast(BasicBlockSize_vec[1])); ++ InferenceFeatures.push_back(static_cast(LoopNumBlocks)); ++ InferenceFeatures.push_back(static_cast(NumLoads)); ++ InferenceFeatures.push_back(static_cast(NumCalls)); ++ InferenceFeatures.push_back(static_cast(TotalLoops)); ++ InferenceFeatures.push_back(static_cast(DeltaTaken)); ++ InferenceFeatures.push_back(static_cast(LoopDepth)); ++ InferenceFeatures.push_back(static_cast(ParentChildNum)); ++ InferenceFeatures.push_back(static_cast(ParentCount)); ++ InferenceFeatures.push_back(static_cast(ChildParentNum)); ++ InferenceFeatures.push_back(static_cast(ChildCount)); ++ } ++ ++ std::vector getInferenceFeatures() { return InferenceFeatures; } ++}; ++} // namespace bolt ++} // namespace llvm ++ ++#endif +\ No newline at end of file +diff --git a/bolt/include/bolt/Passes/FeatureMiner.h b/bolt/include/bolt/Passes/FeatureMiner.h +new file mode 100644 +index 000000000..6170aa62d +--- /dev/null ++++ b/bolt/include/bolt/Passes/FeatureMiner.h +@@ -0,0 +1,176 @@ ++//===--- Passes/FeatureMiner.h ++//---------------------------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// A very simple feature extractor based on Calder's paper ++// Evidence-based static branch prediction using machine learning ++// https://dl.acm.org/doi/10.1145/239912.239923 ++//===----------------------------------------------------------------------===// ++ ++#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_ ++#define LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_ ++ ++#include "bolt/Core/BinaryData.h" ++#include "bolt/Core/BinaryFunction.h" ++#include "bolt/Core/BinaryLoop.h" ++#include "bolt/Passes/BinaryPasses.h" ++#include "bolt/Passes/DominatorAnalysis.h" ++#include "bolt/Passes/StaticBranchInfo.h" ++#include "llvm/ADT/DenseMap.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/MC/MCInst.h" ++#include "llvm/Support/raw_ostream.h" ++#include ++#include ++#include ++#include ++#include ++ ++namespace llvm { ++namespace bolt { ++ ++class FeatureMiner : public BinaryFunctionPass { ++private: ++ std::unique_ptr SBI; ++ /// BasicBlockInfo - This structure holds feature information about the target ++ /// BasicBlock of either the taken or the fallthrough paths of a given branch. 
++ struct BasicBlockInfo { ++ std::optional BranchDominates; // 1 - dominates, 0 - does not dominate ++ std::optional ++ BranchPostdominates; // 1 - postdominates, 0 - does not PD ++ std::optional LoopHeader; // 1 - loop header, 0 - not a loop header ++ std::optional Backedge; // 1 - loop back, 0 - not a loop back ++ std::optional Exit; // 1 - loop exit, 0 - not a loop exit ++ std::optional Call; // 1 - program call, 0 - not a program call ++ std::optional NumCalls; ++ std::optional NumLoads; ++ std::optional NumStores; ++ std::optional EndOpcode; // 0 = NOTHING ++ std::string EndOpcodeStr = "UNDEF"; ++ std::optional BasicBlockSize; ++ std::string FromFunName = "UNDEF"; ++ uint32_t FromBb; ++ std::string ToFunName = "UNDEF"; ++ uint32_t ToBb; ++ ++ std::optional NumCallsExit; ++ std::optional NumCallsInvoke; ++ std::optional NumIndirectCalls; ++ std::optional NumTailCalls; ++ }; ++ ++ typedef std::unique_ptr BBIPtr; ++ ++ /// BranchFeaturesInfo - This structure holds feature information about each ++ /// two-way branch from the program. 
++ struct BranchFeaturesInfo { ++ std::string OpcodeStr = "UNDEF"; ++ std::string CmpOpcodeStr = "UNDEF"; ++ bool Simple = 0; ++ ++ std::optional Opcode; ++ std::optional CmpOpcode; ++ std::optional Count; ++ std::optional MissPredicted; ++ std::optional FallthroughCount; ++ std::optional FallthroughMissPredicted; ++ BBIPtr TrueSuccessor = std::make_unique(); ++ BBIPtr FalseSuccessor = std::make_unique(); ++ std::optional ProcedureType; // 1 - Leaf, 0 - NonLeaf, 2 - CallSelf ++ std::optional LoopHeader; // 1 — loop header, 0 - not a loop header ++ std::optional Direction; // 1 - Forward Branch, 0 - Backward Branch ++ ++ std::optional NumOuterLoops; ++ std::optional TotalLoops; ++ std::optional MaximumLoopDepth; ++ std::optional LoopDepth; ++ std::optional LoopNumExitEdges; ++ std::optional LoopNumExitBlocks; ++ std::optional LoopNumExitingBlocks; ++ std::optional LoopNumLatches; ++ std::optional LoopNumBlocks; ++ std::optional LoopNumBackEdges; ++ std::optional NumLoads; ++ std::optional NumStores; ++ ++ std::optional LocalExitingBlock; ++ std::optional LocalLatchBlock; ++ std::optional LocalLoopHeader; ++ std::optional Call; ++ ++ std::optional NumCalls; ++ std::optional NumCallsExit; ++ std::optional NumCallsInvoke; ++ std::optional NumIndirectCalls; ++ std::optional NumTailCalls; ++ std::optional NumSelfCalls; ++ ++ std::optional NumBasicBlocks; ++ ++ std::optional DeltaTaken; ++ ++ std::optional OperandRAType; ++ std::optional OperandRBType; ++ ++ std::optional BasicBlockSize; ++ ++ std::optional BranchOffset; ++ }; ++ ++ typedef std::unique_ptr BFIPtr; ++ ++ std::vector BranchesInfoSet; ++ ++ /// getProcedureType - Determines which category the function falls into: ++ /// Leaf, Non-leaf or Calls-self. ++ int8_t getProcedureType(BinaryFunction &Function, BinaryContext &BC); ++ ++ /// addSuccessorInfo - Discovers feature information for the target successor ++ /// basic block, and inserts it into the static branch info container. 
++ void addSuccessorInfo(BFIPtr const &BFI, BinaryFunction &Function, ++ BinaryContext &BC, BinaryBasicBlock &BB, bool SuccType); ++ ++ /// extractFeatures - Extracts the feature information for each two-way branch ++ /// from the program. ++ void extractFeatures(BinaryFunction &Function, BinaryContext &BC); ++ ++ void generateInstFeatures(BinaryContext &BC, BinaryBasicBlock &BB, ++ BFIPtr const &BFI, int Index); ++ /// generateSuccessorFeatures - Takes the feature information about the target ++ /// BasicBlock of either the taken or the fallthrough paths of a given branch, plus a BBF (NOTE(review): text was written for dumpSuccessorFeatures; confirm what is stored in BBF). ++ void generateSuccessorFeatures(BBIPtr &Successor, ++ BinaryBasicBlockFeature *BBF); ++ ++ /// dumpFeatures - Dumps the feature information about each two-way branch ++ /// from the program. ++ void dumpFeatures(raw_ostream &Printer, uint64_t FunctionAddress, ++ uint64_t FunctionFrequency); ++ ++ /// dumpProfileData - Dumps a limited version of the input profile data ++ /// that contains only profile for conditional branches, unconditional ++ /// branches and terminators that aren't branches.
++ void dumpProfileData(BinaryFunction &Function, raw_ostream &Printer); ++ ++public: ++ explicit FeatureMiner(const cl::opt &PrintPass) ++ : BinaryFunctionPass(PrintPass) {} ++ ++ std::ofstream trainPrinter; ++ ++ const char *getName() const override { return "feature-miner"; } ++ ++ void runOnFunctions(BinaryContext &BC) override; ++ void inferenceFeatures(BinaryFunction &Function); ++ void generateProfileFeatures(BinaryBasicBlock *BB, ++ BinaryBasicBlockFeature *BBF); ++}; ++ ++} // namespace bolt ++} // namespace llvm ++ ++#endif /* LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_ */ +diff --git a/bolt/include/bolt/Passes/StaticBranchInfo.h b/bolt/include/bolt/Passes/StaticBranchInfo.h +new file mode 100644 +index 000000000..8de8df793 +--- /dev/null ++++ b/bolt/include/bolt/Passes/StaticBranchInfo.h +@@ -0,0 +1,108 @@ ++//===------ Passes/StaticBranchInfo.h -------------------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This is an auxiliary class to the feature miner, static branch probability ++// and frequency passes. This class is responsible for finding loop info (loop ++// back edges, loop exit edges and loop headers) of a function. It also finds ++// basic block info (if a block contains store and call instructions) and if a ++// basic block contains a call to the exit. 
++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STATICBRANCHINFO_H_ ++#define LLVM_TOOLS_LLVM_BOLT_PASSES_STATICBRANCHINFO_H_ ++ ++#include "bolt/Core/BinaryContext.h" ++#include "bolt/Core/BinaryFunction.h" ++#include "bolt/Core/BinaryLoop.h" ++#include "llvm/MC/MCSymbol.h" ++#include ++ ++namespace llvm { ++namespace bolt { ++ ++class StaticBranchInfo { ++ ++public: ++ /// An edge indicates that a control flow may go from a basic block (source) ++ /// to an other one (destination), and this pair of basic blocks will be used ++ /// to index maps and retrieve content of sets. ++ typedef std::pair Edge; ++ ++private: ++ /// Holds the loop headers of a given function. ++ DenseSet LoopHeaders; ++ ++ /// Holds the loop backedges of a given function. ++ DenseSet BackEdges; ++ ++ /// Holds the loop exit edges of a given function. ++ DenseSet ExitEdges; ++ ++ /// Holds the basic blocks of a given function ++ /// that contains at least one call instructions. ++ DenseSet CallSet; ++ ++ /// Holds the basic blocks of a given function ++ /// that contains at least one store instructions. ++ DenseSet StoreSet; ++ ++ unsigned NumLoads; ++ unsigned NumStores; ++ ++public: ++ unsigned getNumLoads() { return NumLoads; } ++ ++ unsigned getNumStores() { return NumStores; } ++ ++ /// findLoopEdgesInfo - Finds all loop back edges, loop exit eges ++ /// and loop headers within the function. ++ void findLoopEdgesInfo(const BinaryLoopInfo &LoopsInfo); ++ ++ /// findBasicBlockInfo - Finds all call and store instructions within ++ /// the basic blocks of a given function. ++ void findBasicBlockInfo(const BinaryFunction &Function, BinaryContext &BC); ++ ++ /// isBackEdge - Checks if the edge is a loop back edge. ++ bool isBackEdge(const Edge &CFGEdge) const; ++ ++ /// isBackEdge - Checks if the edge is a loop back edge. 
++ bool isBackEdge(const BinaryBasicBlock *SrcBB, ++ const BinaryBasicBlock *DstBB) const; ++ ++ /// isExitEdge - Checks if the edge is a loop exit edge. ++ bool isExitEdge(const BinaryLoop::Edge &CFGEdge) const; ++ ++ /// isExitEdge - Checks if the edge is a loop exit edge. ++ bool isExitEdge(const BinaryBasicBlock *SrcBB, ++ const BinaryBasicBlock *DstBB) const; ++ ++ /// isLoopHeader - Checks if the basic block is a loop header. ++ bool isLoopHeader(const BinaryBasicBlock *BB) const; ++ ++ /// hasCallInst - Checks if the basic block has a call instruction. ++ bool hasCallInst(const BinaryBasicBlock *BB) const; ++ ++ /// hasStoreInst - Checks if the basic block has a store instruction. ++ bool hasStoreInst(const BinaryBasicBlock *BB) const; ++ ++ /// countBackEdges - Compute the number of BB's successor that are back edges. ++ unsigned countBackEdges(BinaryBasicBlock *BB) const; ++ ++ /// countExitEdges - Compute the number of BB's successor that are exit edges. ++ unsigned countExitEdges(BinaryBasicBlock *BB) const; ++ ++ /// clear - Cleans up all the content from the data structs used. 
++ void clear(); ++}; ++ ++} // namespace bolt ++} // namespace llvm ++ ++#endif /* LLVM_TOOLS_LLVM_BOLT_PASSES_STATICBRANCHINFO_H_ */ +diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h +index 916b4f7e2..bf732d47c 100644 +--- a/bolt/include/bolt/Profile/DataReader.h ++++ b/bolt/include/bolt/Profile/DataReader.h +@@ -22,6 +22,7 @@ + #include "llvm/Support/MemoryBuffer.h" + #include "llvm/Support/raw_ostream.h" + #include ++#include + #include + #include + +@@ -44,6 +45,15 @@ inline raw_ostream &operator<<(raw_ostream &OS, const LBREntry &LBR) { + return OS; + } + ++extern "C" { ++typedef void *(*CreateONNXRunnerFunc)(const char *); ++typedef void (*DeleteONNXRunnerFunc)(void *); ++typedef std::vector (*RunONNXModelFunc)(void *, ++ const std::vector &, ++ const std::vector &, ++ const std::vector &, int); ++} ++ + struct Location { + bool IsSymbol; + StringRef Name; +@@ -263,7 +273,8 @@ struct FuncSampleData { + class DataReader : public ProfileReaderBase { + public: + explicit DataReader(StringRef Filename) +- : ProfileReaderBase(Filename), Diag(errs()) {} ++ : ProfileReaderBase(Filename), Diag(errs()), onnxRunner(nullptr), ++ libHandle(nullptr), handleOnnxRuntime(nullptr) {} + + StringRef getReaderName() const override { return "branch profile reader"; } + +@@ -282,7 +293,87 @@ public: + /// Return all event names used to collect this profile + StringSet<> getEventNames() const override { return EventNames; } + ++ ~DataReader() { ++ // delete onnxrunner; ++ if (onnxRunner && libHandle && handleOnnxRuntime) { ++ DeleteONNXRunnerFunc deleteONNXRunner = ++ (DeleteONNXRunnerFunc)dlsym(libHandle, "deleteONNXRunner"); ++ deleteONNXRunner(onnxRunner); ++ dlclose(libHandle); ++ dlclose(handleOnnxRuntime); ++ } ++ } ++ ++ /// Initialize the onnxruntime model. 
++ void initializeONNXRunner(const std::string &modelPath) { ++ if (!onnxRunner && !libHandle && !handleOnnxRuntime) { ++ handleOnnxRuntime = ++ dlopen("libonnxruntime.so", RTLD_LAZY | RTLD_GLOBAL); ++ if (handleOnnxRuntime == nullptr) { ++ outs() << "error: llvm-bolt failed during loading onnxruntime.so.\n"; ++ exit(1); ++ } ++ libHandle = dlopen("libONNXRunner.so", RTLD_LAZY); ++ auto createONNXRunner = (CreateONNXRunnerFunc)( ++ libHandle ? dlsym(libHandle, "createONNXRunner") : nullptr); ++ if (createONNXRunner == nullptr) { // library or its entry point is missing ++ outs() << "error: llvm-bolt failed during loading libONNXRunner.so.\n"; ++ exit(1); ++ } ++ onnxRunner = createONNXRunner(modelPath.c_str()); ++ } ++ } ++ ++ /// Inference step for predicting the BB counts based on the BB features. Returns the first model output, or -1.0 when the runner or its runONNXModel entry point is unavailable. ++ float ONNXInference(const std::vector &input_string, ++ const std::vector &input_int64, ++ const std::vector &input_float, int batch_size = 1) { ++ if (onnxRunner && libHandle) { ++ auto runONNXModel = (RunONNXModelFunc)dlsym(libHandle, "runONNXModel"); ++ if (runONNXModel == nullptr) // never call through a null dlsym result ++ return -1.0; ++ std::vector model_preds = runONNXModel( ++ onnxRunner, input_string, input_int64, input_float, batch_size); ++ if (model_preds.empty()) { ++ outs() << "error: llvm-bolt model prediction result cannot be empty.\n"; ++ exit(1); ++ } ++ return model_preds[0]; ++ } ++ return -1.0; ++ } ++ ++ /// Set the annotating threshold for the model prediction. ++ void setThreshold(float annotate_threshold) { ++ threshold = annotate_threshold; ++ } ++ + protected: ++ /// The onnxruntime model pointer read from the input model path. ++ void *onnxRunner; ++ ++ /// The library handle of the ai4compiler framework. ++ void *libHandle; ++ ++ /// The library handle of the onnxruntime. ++ void *handleOnnxRuntime; ++ ++ /// The annotating threshold for the model prediction; 0 until setThreshold() is called. ++ float threshold = 0.0f; ++ ++ /// Return the annotating threshold for the model prediction.
++ float getThreshold() const { return threshold; } ++ ++ /// Running total of modified BB counts. ++ uint64_t modified_BB_total = 0; ++ ++ /// Add the number of BB counts modified within one function to the ++ /// running total. ++ void addModifiedBBTotal(uint64_t &value) { modified_BB_total += value; } ++ ++ /// Return the running total of modified BB counts. ++ uint64_t getModifiedBBTotal() const { return modified_BB_total; } ++ + /// Read profile information available for the function. + void readProfile(BinaryFunction &BF); + +diff --git a/bolt/lib/Core/BinaryBasicBlockFeature.cpp b/bolt/lib/Core/BinaryBasicBlockFeature.cpp +new file mode 100644 +index 000000000..e1a2a3dd8 +--- /dev/null ++++ b/bolt/lib/Core/BinaryBasicBlockFeature.cpp +@@ -0,0 +1,21 @@ ++//===- bolt/Core/BinaryBasicBlockFeature.cpp - Low-level basic block ++//-------------===// ++// ++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. ++// See https://llvm.org/LICENSE.txt for license information. ++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ++// ++//===----------------------------------------------------------------------===// ++// ++// This file is the implementation unit for the BinaryBasicBlockFeature class.
++// ++//===----------------------------------------------------------------------===// ++ ++#include "bolt/Core/BinaryBasicBlock.h" ++#include "bolt/Core/BinaryBasicBlockFeature.h" ++ ++#define DEBUG_TYPE "bolt" ++ ++namespace llvm { ++namespace bolt {} // namespace bolt ++} // namespace llvm +\ No newline at end of file +diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt +index a4612fb93..f93147d39 100644 +--- a/bolt/lib/Core/CMakeLists.txt ++++ b/bolt/lib/Core/CMakeLists.txt +@@ -12,6 +12,7 @@ set(LLVM_LINK_COMPONENTS + + add_llvm_library(LLVMBOLTCore + BinaryBasicBlock.cpp ++ BinaryBasicBlockFeature.cpp + BinaryContext.cpp + BinaryData.cpp + BinaryEmitter.cpp +diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt +index b8bbe59a6..e9ccea17c 100644 +--- a/bolt/lib/Passes/CMakeLists.txt ++++ b/bolt/lib/Passes/CMakeLists.txt +@@ -13,6 +13,7 @@ add_llvm_library(LLVMBOLTPasses + DataflowInfoManager.cpp + FrameAnalysis.cpp + FrameOptimizer.cpp ++ FeatureMiner.cpp + FixRelaxationPass.cpp + FixRISCVCallsPass.cpp + HFSort.cpp +@@ -41,6 +42,7 @@ add_llvm_library(LLVMBOLTPasses + StackAvailableExpressions.cpp + StackPointerTracking.cpp + StackReachingUses.cpp ++ StaticBranchInfo.cpp + StokeInfo.cpp + TailDuplication.cpp + ThreeWayBranch.cpp +diff --git a/bolt/lib/Passes/FeatureMiner.cpp b/bolt/lib/Passes/FeatureMiner.cpp +new file mode 100644 +index 000000000..d93aef648 +--- /dev/null ++++ b/bolt/lib/Passes/FeatureMiner.cpp +@@ -0,0 +1,572 @@ ++//===--- Passes/FeatureMiner.cpp ------------------------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// A very simple feature extractor based on Calder's paper ++// Evidence-based static branch prediction using machine learning ++// https://dl.acm.org/doi/10.1145/239912.239923 ++//===----------------------------------------------------------------------===// ++ ++#include "bolt/Passes/DataflowInfoManager.h" ++#include "bolt/Passes/FeatureMiner.h" ++#include "bolt/Passes/StaticBranchInfo.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Support/FileSystem.h" ++ ++#undef DEBUG_TYPE ++#define DEBUG_TYPE "bolt-feature-miner" ++ ++using namespace llvm; ++using namespace bolt; ++ ++namespace opts { ++extern cl::opt BlockCorrection; ++ ++} // namespace opts ++ ++namespace llvm { ++namespace bolt { ++ ++class BinaryFunction; ++ ++int8_t FeatureMiner::getProcedureType(BinaryFunction &Function, ++ BinaryContext &BC) { ++ int8_t ProcedureType = 1; ++ for (auto &BB : Function) { ++ for (auto &Inst : BB) { ++ if (BC.MIB->isCall(Inst)) { ++ ProcedureType = 0; // non-leaf type ++ if (const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst)) { ++ const auto *Callee = BC.getFunctionForSymbol(CalleeSymbol); ++ if (Callee && ++ Callee->getFunctionNumber() == Function.getFunctionNumber()) { ++ return 2; // call self type ++ } ++ } ++ } ++ } ++ } ++ return ProcedureType; // leaf type ++} ++ ++void FeatureMiner::addSuccessorInfo(BFIPtr const &BFI, BinaryFunction &Function, ++ BinaryContext &BC, BinaryBasicBlock &BB, ++ bool SuccType) { ++ ++ BinaryBasicBlock *Successor = BB.getConditionalSuccessor(SuccType); ++ ++ if (!Successor) ++ return; ++ ++ unsigned NumCalls{0}; ++ ++ for (auto &Inst : BB) { ++ if (BC.MIB->isCall(Inst)) { ++ ++NumCalls; ++ } ++ } ++ ++ BBIPtr SuccBBInfo = std::make_unique(); ++ ++ // Check if the successor basic block is a loop header and store it. 
++ SuccBBInfo->LoopHeader = SBI->isLoopHeader(Successor); ++ ++ SuccBBInfo->BasicBlockSize = Successor->size(); ++ ++ // Check if the edge getting to the successor basic block is a loop ++ // exit edge and store it. ++ SuccBBInfo->Exit = SBI->isExitEdge(&BB, Successor); ++ ++ // Check if the edge getting to the successor basic block is a loop ++ // back edge and store it. ++ SuccBBInfo->Backedge = SBI->isBackEdge(&BB, Successor); ++ ++ MCInst *SuccInst = Successor->getTerminatorBefore(nullptr); ++ ++ // Store information about the branch type ending sucessor basic block ++ SuccBBInfo->EndOpcode = (SuccInst && BC.MIA->isBranch(*SuccInst)) ++ ? SuccInst->getOpcode() ++ : 0; // 0 = NOTHING ++ ++ // Check if the successor basic block contains ++ // a procedure call and store it. ++ SuccBBInfo->Call = (NumCalls > 0) ? 1 // Contains a call instruction ++ : 0; // Does not contain a call instruction ++ ++ uint32_t Offset = BB.getEndOffset(); ++ ++ if (SuccType) { ++ BFI->TrueSuccessor = std::move(SuccBBInfo); ++ // Check if the taken branch is a forward ++ // or a backwards branch and store it ++ BFI->Direction = (Function.isForwardBranch(&BB, Successor) == true) ++ ? 
1 // Forward branch ++ : 0; // Backwards branch ++ ++ auto OnlyBranchInfo = BB.getOnlyBranchInfo(); ++ BFI->Count = OnlyBranchInfo.Count; ++ ++ if (Offset) { ++ uint32_t TargetOffset = Successor->getInputOffset(); ++ uint32_t BranchOffset = Offset; ++ if (BranchOffset != UINT32_MAX && TargetOffset != UINT32_MAX) { ++ int64_t Delta = static_cast(TargetOffset) - ++ static_cast(BranchOffset); ++ BFI->DeltaTaken = std::abs(Delta); ++ } ++ } ++ } else { ++ if (BB.succ_size() == 2) { ++ auto FallthroughBranchInfo = BB.getFallthroughBranchInfo(); ++ BFI->FallthroughCount = FallthroughBranchInfo.Count; ++ } else { ++ auto OnlyBranchInfo = BB.getOnlyBranchInfo(); ++ BFI->FallthroughCount = OnlyBranchInfo.Count; ++ } ++ BFI->FalseSuccessor = std::move(SuccBBInfo); ++ } ++} ++ ++void FeatureMiner::extractFeatures(BinaryFunction &Function, ++ BinaryContext &BC) { ++ int8_t ProcedureType = getProcedureType(Function, BC); ++ auto Info = DataflowInfoManager(Function, nullptr, nullptr); ++ const BinaryLoopInfo &LoopsInfo = Function.getLoopInfo(); ++ ++ bool Simple = Function.isSimple(); ++ const auto &Order = Function.dfs(); ++ std::string Function_name = Function.getPrintName(); ++ ++ for (auto *BBA : Order) { ++ ++ auto &BB = *BBA; ++ ++ BinaryBasicBlockFeature BBF = BB.getFeatures(); ++ ++ unsigned TotalLoops{0}; ++ unsigned LoopDepth{0}; ++ unsigned LoopNumBlocks{0}; ++ ++ bool LocalExitingBlock{false}; ++ bool LocalLatchBlock{false}; ++ bool LocalLoopHeader{false}; ++ ++ generateProfileFeatures(&BB, &BBF); ++ ++ BinaryLoop *Loop = LoopsInfo.getLoopFor(&BB); ++ if (Loop) { ++ SmallVector ExitingBlocks; ++ Loop->getExitingBlocks(ExitingBlocks); ++ ++ SmallVector ExitBlocks; ++ Loop->getExitBlocks(ExitBlocks); ++ ++ SmallVector ExitEdges; ++ Loop->getExitEdges(ExitEdges); ++ ++ SmallVector Latches; ++ Loop->getLoopLatches(Latches); ++ ++ TotalLoops = LoopsInfo.TotalLoops; ++ LoopDepth = Loop->getLoopDepth(); ++ LoopNumBlocks = Loop->getNumBlocks(); ++ LocalExitingBlock = 
Loop->isLoopExiting(&BB); ++ LocalLatchBlock = Loop->isLoopLatch(&BB); ++ LocalLoopHeader = ((Loop->getHeader() == (&BB)) ? 1 : 0); ++ } ++ ++ unsigned NumLoads{0}; ++ unsigned NumCalls{0}; ++ unsigned NumIndirectCalls{0}; ++ ++ for (auto &Inst : BB) { ++ if (BC.MIB->isLoad(Inst)) { ++ ++NumLoads; ++ } else if (BC.MIB->isCall(Inst)) { ++ ++NumCalls; ++ if (BC.MIB->isIndirectCall(Inst)) ++ ++NumIndirectCalls; ++ } ++ } ++ ++ int Index = -2; ++ bool LoopHeader = SBI->isLoopHeader(&BB); ++ ++ BFIPtr BFI = std::make_unique(); ++ ++ BFI->TotalLoops = TotalLoops; ++ BFI->LoopDepth = LoopDepth; ++ BFI->LoopNumBlocks = LoopNumBlocks; ++ BFI->LocalExitingBlock = LocalExitingBlock; ++ BFI->LocalLatchBlock = LocalLatchBlock; ++ BFI->LocalLoopHeader = LocalLoopHeader; ++ BFI->NumCalls = NumCalls; ++ BFI->BasicBlockSize = BB.size(); ++ BFI->NumBasicBlocks = Function.size(); ++ ++ BFI->NumLoads = NumLoads; ++ BFI->NumIndirectCalls = NumIndirectCalls; ++ BFI->LoopHeader = LoopHeader; ++ BFI->ProcedureType = ProcedureType; ++ ++ // Adding taken successor info. ++ addSuccessorInfo(BFI, Function, BC, BB, true); ++ // Adding fall through successor info. 
++ addSuccessorInfo(BFI, Function, BC, BB, false); ++ ++ MCInst ConditionalInst; ++ bool hasConditionalBranch = false; ++ MCInst UnconditionalInst; ++ bool hasUnconditionalBranch = false; ++ ++ for (auto &Inst : BB) { ++ ++Index; ++ if (!BC.MIA->isConditionalBranch(Inst) && ++ !BC.MIA->isUnconditionalBranch(Inst)) ++ continue; ++ ++ generateInstFeatures(BC, BB, BFI, Index); ++ ++ if (BC.MIA->isConditionalBranch(Inst)) { ++ ConditionalInst = Inst; ++ hasConditionalBranch = true; ++ } ++ ++ if (BC.MIA->isUnconditionalBranch(Inst)) { ++ UnconditionalInst = Inst; ++ hasUnconditionalBranch = true; ++ } ++ } ++ ++ if (hasConditionalBranch) { ++ BFI->Opcode = ConditionalInst.getOpcode(); ++ ++ } else { ++ if (hasUnconditionalBranch) { ++ BFI->Opcode = UnconditionalInst.getOpcode(); ++ ++ } else { ++ auto Inst = BB.getLastNonPseudoInstr(); ++ BFI->Opcode = Inst->getOpcode(); ++ generateInstFeatures(BC, BB, BFI, Index); ++ } ++ } ++ ++ auto &FalseSuccessor = BFI->FalseSuccessor; ++ auto &TrueSuccessor = BFI->TrueSuccessor; ++ ++ int16_t ProcedureType = (BFI->ProcedureType.has_value()) ++ ? static_cast(*(BFI->ProcedureType)) ++ : -1; ++ ++ int64_t Count = ++ (BFI->Count.has_value()) ? static_cast(*(BFI->Count)) : -1; ++ ++ int64_t FallthroughCount = ++ (BFI->FallthroughCount.has_value()) ++ ? static_cast(*(BFI->FallthroughCount)) ++ : -1; ++ ++ int16_t LoopHeaderValid = (BFI->LoopHeader.has_value()) ++ ? static_cast(*(BFI->LoopHeader)) ++ : -1; ++ ++ int64_t TotalLoopsValid = (BFI->TotalLoops.has_value()) ++ ? static_cast(*(BFI->TotalLoops)) ++ : -1; ++ int64_t LoopDepthValid = (BFI->LoopDepth.has_value()) ++ ? static_cast(*(BFI->LoopDepth)) ++ : -1; ++ int64_t LoopNumBlocksValid = ++ (BFI->LoopNumBlocks.has_value()) ++ ? static_cast(*(BFI->LoopNumBlocks)) ++ : -1; ++ int64_t LocalExitingBlockValid = ++ (BFI->LocalExitingBlock.has_value()) ++ ? static_cast(*(BFI->LocalExitingBlock)) ++ : -1; ++ ++ int64_t LocalLatchBlockValid = ++ (BFI->LocalLatchBlock.has_value()) ++ ? 
static_cast(*(BFI->LocalLatchBlock)) ++ : -1; ++ ++ int64_t LocalLoopHeaderValid = ++ (BFI->LocalLoopHeader.has_value()) ++ ? static_cast(*(BFI->LocalLoopHeader)) ++ : -1; ++ ++ int32_t CmpOpcode = (BFI->CmpOpcode.has_value()) ++ ? static_cast(*(BFI->CmpOpcode)) ++ : -1; ++ ++ int64_t OperandRAType = (BFI->OperandRAType.has_value()) ++ ? static_cast(*(BFI->OperandRAType)) ++ : 10; ++ ++ int64_t OperandRBType = (BFI->OperandRBType.has_value()) ++ ? static_cast(*(BFI->OperandRBType)) ++ : 10; ++ int16_t Direction = (BFI->Direction.has_value()) ++ ? static_cast(*(BFI->Direction)) ++ : -1; ++ ++ int64_t DeltaTaken = (BFI->DeltaTaken.has_value()) ++ ? static_cast(*(BFI->DeltaTaken)) ++ : -1; ++ ++ int64_t NumLoadsValid = (BFI->NumLoads.has_value()) ++ ? static_cast(*(BFI->NumLoads)) ++ : -1; ++ ++ int64_t BasicBlockSize = (BFI->BasicBlockSize.has_value()) ++ ? static_cast(*(BFI->BasicBlockSize)) ++ : -1; ++ ++ int64_t NumBasicBlocks = (BFI->NumBasicBlocks.has_value()) ++ ? static_cast(*(BFI->NumBasicBlocks)) ++ : -1; ++ ++ int64_t NumCallsValid = (BFI->NumCalls.has_value()) ++ ? static_cast(*(BFI->NumCalls)) ++ : -1; ++ ++ int64_t NumIndirectCallsValid = ++ (BFI->NumIndirectCalls.has_value()) ++ ? static_cast(*(BFI->NumIndirectCalls)) ++ : -1; ++ ++ int64_t HasIndirectCalls = (NumIndirectCallsValid > 0) ? 1 : 0; ++ ++ int32_t Opcode = ++ (BFI->Opcode.has_value()) ? static_cast(*(BFI->Opcode)) : -1; ++ ++ uint64_t fun_exec = Function.getExecutionCount(); ++ fun_exec = (fun_exec != UINT64_MAX) ? 
fun_exec : 0; ++ ++ BBF.setDirection(Direction); ++ BBF.setDeltaTaken(DeltaTaken); ++ BBF.setOpcode(Opcode); ++ BBF.setCmpOpcode(CmpOpcode); ++ BBF.setOperandRAType(OperandRAType); ++ BBF.setOperandRBType(OperandRBType); ++ BBF.setFunExec(fun_exec); ++ BBF.setTotalLoops(TotalLoopsValid); ++ BBF.setLoopDepth(LoopDepthValid); ++ BBF.setLoopNumBlocks(LoopNumBlocksValid); ++ BBF.setLocalExitingBlock(LocalExitingBlockValid); ++ BBF.setLocalLatchBlock(LocalLatchBlockValid); ++ BBF.setLocalLoopHeader(LocalLoopHeaderValid); ++ BBF.setNumCalls(NumCallsValid); ++ BBF.setBasicBlockSize(BasicBlockSize); ++ BBF.setNumBasicBlocks(NumBasicBlocks); ++ BBF.setNumLoads(NumLoadsValid); ++ BBF.setHasIndirectCalls(HasIndirectCalls); ++ BBF.setLoopHeader(LoopHeaderValid); ++ BBF.setProcedureType(ProcedureType); ++ BBF.setCount(Count); ++ BBF.setFallthroughCount(FallthroughCount); ++ ++ generateSuccessorFeatures(TrueSuccessor, &BBF); ++ generateSuccessorFeatures(FalseSuccessor, &BBF); ++ ++ FalseSuccessor.reset(); ++ TrueSuccessor.reset(); ++ ++ BBF.setInferenceFeatures(); ++ BB.setFeatures(BBF); ++ ++ BFI.reset(); ++ } ++} ++ ++void FeatureMiner::generateInstFeatures(BinaryContext &BC, BinaryBasicBlock &BB, ++ BFIPtr const &BFI, int Index) { ++ ++ // Holds the branch opcode info. ++ ++ BFI->CmpOpcode = 0; ++ if (Index > -1) { ++ auto Cmp = BB.begin() + Index; ++ if (BC.MII->get((*Cmp).getOpcode()).isCompare()) { ++ // Holding the branch comparison opcode info. 
++ BFI->CmpOpcode = (*Cmp).getOpcode(); ++ auto getOperandType = [&](const MCOperand &Operand) -> int32_t { ++ if (Operand.isReg()) ++ return 0; ++ else if (Operand.isImm()) ++ return 1; ++ else if (Operand.isSFPImm()) ++ return 2; ++ else if (Operand.isExpr()) ++ return 3; ++ else ++ return -1; ++ }; ++ ++ const auto InstInfo = BC.MII->get((*Cmp).getOpcode()); ++ unsigned NumDefs = InstInfo.getNumDefs(); ++ int32_t NumPrimeOperands = MCPlus::getNumPrimeOperands(*Cmp) - NumDefs; ++ switch (NumPrimeOperands) { ++ case 6: { ++ int32_t RBType = getOperandType((*Cmp).getOperand(NumDefs)); ++ int32_t RAType = getOperandType((*Cmp).getOperand(NumDefs + 1)); ++ ++ if (RBType == 0 && RAType == 0) { ++ BFI->OperandRBType = RBType; ++ BFI->OperandRAType = RAType; ++ } else if (RBType == 0 && (RAType == 1 || RAType == 2)) { ++ RAType = getOperandType((*Cmp).getOperand(NumPrimeOperands - 1)); ++ ++ if (RAType != 1 && RAType != 2) { ++ RAType = -1; ++ } ++ ++ BFI->OperandRBType = RBType; ++ BFI->OperandRAType = RAType; ++ } else { ++ BFI->OperandRAType = -1; ++ BFI->OperandRBType = -1; ++ } ++ break; ++ } ++ case 2: ++ BFI->OperandRBType = getOperandType((*Cmp).getOperand(NumDefs)); ++ BFI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs + 1)); ++ break; ++ case 3: ++ BFI->OperandRBType = getOperandType((*Cmp).getOperand(NumDefs)); ++ BFI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs + 2)); ++ break; ++ case 1: ++ BFI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs)); ++ break; ++ default: ++ BFI->OperandRAType = -1; ++ BFI->OperandRBType = -1; ++ break; ++ } ++ ++ } else { ++ Index -= 1; ++ for (int Idx = Index; Idx > -1; Idx--) { ++ auto Cmp = BB.begin() + Idx; ++ if (BC.MII->get((*Cmp).getOpcode()).isCompare()) { ++ // Holding the branch comparison opcode info. 
++ BFI->CmpOpcode = (*Cmp).getOpcode(); ++ break; ++ } ++ } ++ } ++ } ++} ++ ++void FeatureMiner::generateSuccessorFeatures(BBIPtr &Successor, ++ BinaryBasicBlockFeature *BBF) { ++ ++ int16_t LoopHeader = (Successor->LoopHeader.has_value()) ++ ? static_cast(*(Successor->LoopHeader)) ++ : -1; ++ ++ int16_t Backedge = (Successor->Backedge.has_value()) ++ ? static_cast(*(Successor->Backedge)) ++ : -1; ++ ++ int16_t Exit = (Successor->Exit.has_value()) ++ ? static_cast(*(Successor->Exit)) ++ : -1; ++ ++ int16_t Call = (Successor->Call.has_value()) ++ ? static_cast(*(Successor->Call)) ++ : -1; ++ ++ int32_t EndOpcode = (Successor->EndOpcode.has_value()) ++ ? static_cast(*(Successor->EndOpcode)) ++ : -1; ++ ++ int64_t BasicBlockSize = ++ (Successor->BasicBlockSize.has_value()) ++ ? static_cast(*(Successor->BasicBlockSize)) ++ : -1; ++ ++ BBF->setEndOpcodeVec(EndOpcode); ++ BBF->setLoopHeaderVec(LoopHeader); ++ BBF->setBackedgeVec(Backedge); ++ BBF->setExitVec(Exit); ++ BBF->setCallVec(Call); ++ BBF->setBasicBlockSizeVec(BasicBlockSize); ++} ++ ++void FeatureMiner::runOnFunctions(BinaryContext &BC) {} ++ ++void FeatureMiner::inferenceFeatures(BinaryFunction &Function) { ++ ++ SBI = std::make_unique(); ++ ++ if (Function.empty()) ++ return; ++ ++ if (!Function.isLoopFree()) { ++ const BinaryLoopInfo &LoopsInfo = Function.getLoopInfo(); ++ SBI->findLoopEdgesInfo(LoopsInfo); ++ } ++ ++ BinaryContext &BC = Function.getBinaryContext(); ++ extractFeatures(Function, BC); ++ ++ SBI->clear(); ++} ++ ++void FeatureMiner::generateProfileFeatures(BinaryBasicBlock *BB, ++ BinaryBasicBlockFeature *BBF) { ++ int32_t parentChildNum, parentCount, childParentNum, childCount; ++ ++ if (BB->getParentSet().size() == 0) { ++ parentChildNum = -1; ++ parentCount = -1; ++ } else { ++ parentChildNum = std::numeric_limits::max(); ++ parentCount = 0; ++ for (BinaryBasicBlock *parent : BB->getParentSet()) { ++ if (parent->getChildrenSet().size() < parentChildNum) { ++ parentChildNum = 
parent->getChildrenSet().size(); ++ parentCount = parent->getExecutionCount(); ++ } else if (parent->getChildrenSet().size() == parentChildNum && ++ parent->getExecutionCount() > parentCount) { ++ parentCount = parent->getExecutionCount(); ++ } ++ } ++ } ++ ++ if (BB->getChildrenSet().size() == 0) { ++ childParentNum = -1; ++ childCount = -1; ++ } else { ++ childParentNum = std::numeric_limits::max(); ++ childCount = 0; ++ for (BinaryBasicBlock *child : BB->getChildrenSet()) { ++ if (child->getParentSet().size() < childParentNum) { ++ childParentNum = child->getParentSet().size(); ++ childCount = child->getExecutionCount(); ++ } else if (child->getParentSet().size() == childParentNum && ++ child->getExecutionCount() > childCount) { ++ childCount = child->getExecutionCount(); ++ } ++ } ++ } ++ ++ int64_t parentCountCatch = parentCount > 0 ? 1 : 0; ++ int64_t childCountCatch = childCount > 0 ? 1 : 0; ++ ++ BBF->setParentChildNum(parentChildNum); ++ BBF->setParentCount(parentCountCatch); ++ BBF->setChildParentNum(childParentNum); ++ BBF->setChildCount(childCountCatch); ++} ++ ++} // namespace bolt ++} // namespace llvm +\ No newline at end of file +diff --git a/bolt/lib/Passes/StaticBranchInfo.cpp b/bolt/lib/Passes/StaticBranchInfo.cpp +new file mode 100644 +index 000000000..585dbcae2 +--- /dev/null ++++ b/bolt/lib/Passes/StaticBranchInfo.cpp +@@ -0,0 +1,143 @@ ++//===------ Passes/StaticBranchInfo.cpp -----------------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This is an auxiliary class to the feature miner, static branch probability ++// and frequency passes. This class is responsible for finding loop info (loop ++// back edges, loop exit edges and loop headers) of a function. 
It also finds ++// basic block info (if a block contains store and call instructions) and if a ++// basic block contains a call to the exit. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "bolt/Core/BinaryBasicBlock.h" ++#include "bolt/Core/BinaryLoop.h" ++#include "bolt/Passes/StaticBranchInfo.h" ++ ++namespace llvm { ++namespace bolt { ++ ++void StaticBranchInfo::findLoopEdgesInfo(const BinaryLoopInfo &LoopsInfo) { ++ // Traverse discovered loops ++ std::stack Loops; ++ for (BinaryLoop *BL : LoopsInfo) ++ Loops.push(BL); ++ ++ while (!Loops.empty()) { ++ BinaryLoop *Loop = Loops.top(); ++ Loops.pop(); ++ BinaryBasicBlock *LoopHeader = Loop->getHeader(); ++ LoopHeaders.insert(LoopHeader); ++ ++ // Add nested loops in the stack. ++ for (BinaryLoop::iterator I = Loop->begin(), E = Loop->end(); I != E; ++I) { ++ Loops.push(*I); ++ } ++ ++ SmallVector Latches; ++ Loop->getLoopLatches(Latches); ++ ++ // Find back edges. ++ for (BinaryBasicBlock *Latch : Latches) { ++ for (BinaryBasicBlock *Succ : Latch->successors()) { ++ if (Succ == LoopHeader) { ++ Edge CFGEdge = std::make_pair(Latch->getLabel(), Succ->getLabel()); ++ BackEdges.insert(CFGEdge); ++ } ++ } ++ } ++ ++ // Find exit edges. 
++ SmallVector AuxExitEdges; ++ Loop->getExitEdges(AuxExitEdges); ++ for (BinaryLoop::Edge &Exit : AuxExitEdges) { ++ ExitEdges.insert(Exit); ++ } ++ } ++} ++ ++void StaticBranchInfo::findBasicBlockInfo(const BinaryFunction &Function, ++ BinaryContext &BC) { ++ for (auto &BB : Function) { ++ for (auto &Inst : BB) { ++ if (BC.MIB->isCall(Inst)) ++ CallSet.insert(&BB); ++ else if (BC.MIB->isStore(Inst)) ++ StoreSet.insert(&BB); ++ } ++ } ++} ++ ++bool StaticBranchInfo::isBackEdge(const Edge &CFGEdge) const { ++ return BackEdges.count(CFGEdge); ++} ++ ++bool StaticBranchInfo::isBackEdge(const BinaryBasicBlock *SrcBB, ++ const BinaryBasicBlock *DstBB) const { ++ const Edge CFGEdge = std::make_pair(SrcBB->getLabel(), DstBB->getLabel()); ++ return isBackEdge(CFGEdge); ++} ++ ++bool StaticBranchInfo::isExitEdge(const BinaryLoop::Edge &CFGEdge) const { ++ return ExitEdges.count(CFGEdge); ++} ++ ++bool StaticBranchInfo::isExitEdge(const BinaryBasicBlock *SrcBB, ++ const BinaryBasicBlock *DstBB) const { ++ const BinaryLoop::Edge CFGEdge = ++ std::make_pair(const_cast(SrcBB), ++ const_cast(DstBB)); ++ return isExitEdge(CFGEdge); ++} ++ ++bool StaticBranchInfo::isLoopHeader(const BinaryBasicBlock *BB) const { ++ return LoopHeaders.count(BB); ++} ++ ++bool StaticBranchInfo::hasCallInst(const BinaryBasicBlock *BB) const { ++ return CallSet.count(BB); ++} ++ ++bool StaticBranchInfo::hasStoreInst(const BinaryBasicBlock *BB) const { ++ return StoreSet.count(BB); ++} ++ ++unsigned StaticBranchInfo::countBackEdges(BinaryBasicBlock *BB) const { ++ unsigned CountEdges = 0; ++ ++ for (BinaryBasicBlock *SuccBB : BB->successors()) { ++ const Edge CFGEdge = std::make_pair(BB->getLabel(), SuccBB->getLabel()); ++ if (BackEdges.count(CFGEdge)) ++ ++CountEdges; ++ } ++ ++ return CountEdges; ++} ++ ++unsigned StaticBranchInfo::countExitEdges(BinaryBasicBlock *BB) const { ++ unsigned CountEdges = 0; ++ ++ for (BinaryBasicBlock *SuccBB : BB->successors()) { ++ const BinaryLoop::Edge CFGEdge = 
std::make_pair(BB, SuccBB); ++ if (ExitEdges.count(CFGEdge)) ++ ++CountEdges; ++ } ++ ++ return CountEdges; ++} ++ ++void StaticBranchInfo::clear() { ++ LoopHeaders.clear(); ++ BackEdges.clear(); ++ ExitEdges.clear(); ++ CallSet.clear(); ++ StoreSet.clear(); ++} ++ ++} // namespace bolt ++} // namespace llvm +diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp +index 0e12e8cb3..447b71fe7 100644 +--- a/bolt/lib/Profile/DataReader.cpp ++++ b/bolt/lib/Profile/DataReader.cpp +@@ -12,13 +12,16 @@ + //===----------------------------------------------------------------------===// + + #include "bolt/Profile/DataReader.h" ++#include "bolt/Passes/FeatureMiner.h" + #include "bolt/Core/BinaryFunction.h" + #include "bolt/Passes/MCF.h" + #include "bolt/Utils/Utils.h" + #include "llvm/Support/CommandLine.h" + #include "llvm/Support/Debug.h" + #include "llvm/Support/Errc.h" ++#include + #include ++#include + + #undef DEBUG_TYPE + #define DEBUG_TYPE "bolt-prof" +@@ -26,15 +29,23 @@ + using namespace llvm; + + namespace opts { +- ++extern cl::opt BlockCorrection; + extern cl::OptionCategory BoltCategory; + extern llvm::cl::opt Verbosity; + +-static cl::opt +-DumpData("dump-data", +- cl::desc("dump parsed bolt data for debugging"), +- cl::Hidden, +- cl::cat(BoltCategory)); ++static cl::opt InputModelFilename("model-path", ++ cl::desc(""), ++ cl::Optional, ++ cl::cat(BoltCategory)); ++ ++static cl::opt AnnotateThreshold( ++ "annotate-threshold", ++ cl::desc(""), ++ cl::init(0.85f), cl::Optional, cl::cat(BoltCategory)); ++ ++static cl::opt DumpData("dump-data", ++ cl::desc("dump parsed bolt data for debugging"), ++ cl::Hidden, cl::cat(BoltCategory)); + + } // namespace opts + +@@ -311,6 +322,17 @@ Error DataReader::readProfilePreCFG(BinaryContext &BC) { + } + + Error DataReader::readProfile(BinaryContext &BC) { ++ ++ if (opts::BlockCorrection) { ++ if (opts::InputModelFilename.empty()) { ++ outs() << "error: llvm-bolt expected -model-path= option.\n"; ++ 
exit(1); ++ } else { ++ DataReader::initializeONNXRunner(opts::InputModelFilename); ++ DataReader::setThreshold(opts::AnnotateThreshold); ++ } ++ } ++ + for (auto &BFI : BC.getBinaryFunctions()) { + BinaryFunction &Function = BFI.second; + readProfile(Function); +@@ -324,6 +346,12 @@ Error DataReader::readProfile(BinaryContext &BC) { + } + BC.setNumUnusedProfiledObjects(NumUnused); + ++ if (opts::BlockCorrection) { ++ uint64_t modified_total = DataReader::getModifiedBBTotal(); ++ outs() << "BOLT-INFO: total modified CFG BB count number is " ++ << modified_total << ".\n"; ++ } ++ + return Error::success(); + } + +@@ -555,6 +583,75 @@ float DataReader::evaluateProfileData(BinaryFunction &BF, + return MatchRatio; + } + ++void generateChildrenParentCount(BinaryBasicBlock *BB) { ++ typedef GraphTraits GraphT; ++ ++ for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB), ++ E = GraphT::child_end(BB); ++ CI != E; ++CI) { ++ typename GraphT::NodeRef Child = *CI; ++ BB->insertChildrenSet(Child); ++ Child->insertParentSet(BB); ++ } ++} ++ ++void generateChildrenParentCount(BinaryFunction &BF) { ++ for (BinaryBasicBlock &BB : BF) { ++ generateChildrenParentCount(&BB); ++ } ++} ++ ++uint64_t estimateBBCount(DataReader *dataReaderRef, BinaryBasicBlock *BB, ++ float threshold) { ++ uint64_t modified = 0; ++ if (BB->getExecutionCount() != 0) { ++ return modified; ++ } ++ ++ std::vector input_string; ++ std::vector input_int64; ++ std::vector input_float; ++ ++ BinaryBasicBlockFeature BBF = BB->getFeatures(); ++ input_int64 = BBF.getInferenceFeatures(); ++ ++ if (input_int64.empty()) { ++ return 0; ++ } ++ ++ float model_pred = ++ dataReaderRef->ONNXInference(input_string, input_int64, input_float); ++ if (model_pred >= threshold) { ++ uint64_t min_neighbor_count = std::numeric_limits::max(); ++ for (BinaryBasicBlock *parent : BB->getParentSet()) { ++ if (parent->getExecutionCount() > 0 && ++ parent->getExecutionCount() < min_neighbor_count) ++ min_neighbor_count = 
parent->getExecutionCount(); ++ } ++ for (BinaryBasicBlock *child : BB->getChildrenSet()) { ++ if (child->getExecutionCount() > 0 && ++ child->getExecutionCount() < min_neighbor_count) ++ min_neighbor_count = child->getExecutionCount(); ++ } ++ if (min_neighbor_count != std::numeric_limits::max()) { ++ BB->setExecutionCount(min_neighbor_count); ++ modified = 1; ++ } ++ } ++ return modified; ++} ++ ++uint64_t estimateBBCount(DataReader *dataReaderRef, BinaryFunction &BF, ++ float threshold) { ++ uint64_t modified_total_func = 0; ++ const auto &Order = BF.dfs(); ++ for (auto *BBA : Order) { ++ auto &BB = *BBA; ++ modified_total_func += estimateBBCount(dataReaderRef, &BB, threshold); ++ } ++ return modified_total_func; ++} ++ + void DataReader::readSampleData(BinaryFunction &BF) { + FuncSampleData *SampleDataOrErr = getFuncSampleData(BF.getNames()); + if (!SampleDataOrErr) +@@ -600,6 +697,17 @@ void DataReader::readSampleData(BinaryFunction &BF) { + + BF.ExecutionCount = TotalEntryCount; + ++ if (opts::BlockCorrection) { ++ generateChildrenParentCount(BF); ++ std::unique_ptr FM = ++ std::make_unique(opts::BlockCorrection); ++ FM->inferenceFeatures(BF); ++ ++ float threshold = DataReader::getThreshold(); ++ uint64_t modified_total_func = estimateBBCount(this, BF, threshold); ++ DataReader::addModifiedBBTotal(modified_total_func); ++ } ++ + estimateEdgeCounts(BF); + } + +diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp +index c6ea0b009..4191e18bd 100644 +--- a/bolt/lib/Rewrite/RewriteInstance.cpp ++++ b/bolt/lib/Rewrite/RewriteInstance.cpp +@@ -106,6 +106,12 @@ cl::opt DumpDotAll( + "enable '-print-loops' for color-coded blocks"), + cl::Hidden, cl::cat(BoltCategory)); + ++cl::opt BlockCorrection( ++ "block-correction", ++ cl::desc("capture features useful for ML model to inference the count on the binary basic block" ++ " and correct them on CFG."), ++ cl::ZeroOrMore, cl::cat(BoltOptCategory)); ++ + static cl::list + 
ForceFunctionNames("funcs", + cl::CommaSeparated, +-- +2.39.3 (Apple Git-146) + diff --git a/llvm-bolt.spec b/llvm-bolt.spec index bc6ad9d..fd0a8f9 100644 --- a/llvm-bolt.spec +++ b/llvm-bolt.spec @@ -22,7 +22,7 @@ Name: %{pkg_name} Version: %{bolt_version} -Release: 6 +Release: 7 Summary: BOLT is a post-link optimizer developed to speed up large applications License: Apache 2.0 URL: https://github.com/llvm/llvm-project/tree/main/bolt @@ -35,6 +35,7 @@ Patch2: 0002-Add-test-for-emitting-trap-value.patch Patch3: 0003-AArch64-Add-AArch64-support-for-inline.patch Patch4: 0004-Bolt-Solving-pie-support-issue.patch Patch5: 0005-BOLT-AArch64-Don-t-change-layout-in-PatchEntries.patch +Patch6: 0006-AArch64-Add-CFG-block-count-correction-optimization.patch BuildRequires: gcc BuildRequires: gcc-c++ @@ -146,6 +147,12 @@ rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}/lib*.a %doc %{install_docdir} %changelog +* Fri Jul 12 2024 rfwang07 17.0.6-7 +- Type:Feature +- ID:NA +- SUG:NA +- DESC: Add CFG block count correction optimization. + * Fri Jun 21 2024 rfwang07 17.0.6-6 - Type:Backport - ID:NA