//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class can produce a generic deterministic finite state automaton (DFA),
// given a set of possible states and transitions.
//
// The input transitions can be nondeterministic - this class will produce the
// deterministic equivalent state machine.
//
// The generated code can run the DFA and produce an accepted / not accepted
// state and also produce, given a sequence of transitions that results in an
// accepted state, the sequence of intermediate states. This is useful if the
// initial automaton was nondeterministic - it allows mapping back from the DFA
// to the NFA.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "dfa-emitter"

#include "DFAEmitter.h"
#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
#include "TableGenBackends.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <deque>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

//===----------------------------------------------------------------------===//
// DfaEmitter implementation. This is independent of the GenAutomaton backend.
46 //===----------------------------------------------------------------------===// 47 48 void DfaEmitter::addTransition(state_type From, state_type To, action_type A) { 49 Actions.insert(A); 50 NfaStates.insert(From); 51 NfaStates.insert(To); 52 NfaTransitions[{From, A}].push_back(To); 53 ++NumNfaTransitions; 54 } 55 56 void DfaEmitter::visitDfaState(DfaState DS) { 57 // For every possible action... 58 auto FromId = DfaStates.idFor(DS); 59 for (action_type A : Actions) { 60 DfaState NewStates; 61 DfaTransitionInfo TI; 62 // For every represented state, word pair in the original NFA... 63 for (state_type &FromState : DS) { 64 // If this action is possible from this state add the transitioned-to 65 // states to NewStates. 66 auto I = NfaTransitions.find({FromState, A}); 67 if (I == NfaTransitions.end()) 68 continue; 69 for (state_type &ToState : I->second) { 70 NewStates.push_back(ToState); 71 TI.emplace_back(FromState, ToState); 72 } 73 } 74 if (NewStates.empty()) 75 continue; 76 // Sort and unique. 77 sort(NewStates); 78 NewStates.erase(std::unique(NewStates.begin(), NewStates.end()), 79 NewStates.end()); 80 sort(TI); 81 TI.erase(std::unique(TI.begin(), TI.end()), TI.end()); 82 unsigned ToId = DfaStates.insert(NewStates); 83 DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI)); 84 } 85 } 86 87 void DfaEmitter::constructDfa() { 88 DfaState Initial(1, /*NFA initial state=*/0); 89 DfaStates.insert(Initial); 90 91 // Note that UniqueVector starts indices at 1, not zero. 
92 unsigned DfaStateId = 1; 93 while (DfaStateId <= DfaStates.size()) 94 visitDfaState(DfaStates[DfaStateId++]); 95 } 96 97 void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { 98 constructDfa(); 99 100 OS << "// Input NFA has " << NfaStates.size() << " states with " 101 << NumNfaTransitions << " transitions.\n"; 102 OS << "// Generated DFA has " << DfaStates.size() << " states with " 103 << DfaTransitions.size() << " transitions.\n\n"; 104 105 // Implementation note: We don't bake a simple std::pair<> here as it requires 106 // significantly more effort to parse. A simple test with a large array of 107 // struct-pairs (N=100000) took clang-10 6s to parse. The same array of 108 // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to 109 // define the pair type. 110 // 111 // FIXME: It may make sense to emit these as ULEB sequences instead of 112 // pairs of uint64_t. 113 OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n"; 114 OS << "// transition implies a set of NFA transitions. 
These are referred\n"; 115 OS << "// to by index in " << Name << "Transitions[].\n"; 116 117 SequenceToOffsetTable<DfaTransitionInfo> Table; 118 std::map<DfaTransitionInfo, unsigned> EmittedIndices; 119 for (auto &T : DfaTransitions) 120 Table.add(T.second.second); 121 Table.layout(); 122 OS << "std::array<NfaStatePair, " << Table.size() << "> " << Name 123 << "TransitionInfo = {{\n"; 124 Table.emit( 125 OS, 126 [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) { 127 OS << "{" << P.first << ", " << P.second << "}"; 128 }, 129 "{0ULL, 0ULL}"); 130 131 OS << "}};\n\n"; 132 133 OS << "// A transition in the generated " << Name << " DFA.\n"; 134 OS << "struct " << Name << "Transition {\n"; 135 OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n"; 136 OS << " "; 137 printActionType(OS); 138 OS << " Action; // The input symbol that causes this transition.\n"; 139 OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n"; 140 OS << " unsigned InfoIdx; // Start index into " << Name 141 << "TransitionInfo.\n"; 142 OS << "};\n\n"; 143 144 OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; 145 OS << "// The initial state is 1, not zero.\n"; 146 OS << "std::array<" << Name << "Transition, " << DfaTransitions.size() << "> " 147 << Name << "Transitions = {{\n"; 148 for (auto &KV : DfaTransitions) { 149 dfa_state_type From = KV.first.first; 150 dfa_state_type To = KV.second.first; 151 action_type A = KV.first.second; 152 unsigned InfoIdx = Table.get(KV.second.second); 153 OS << " {" << From << ", "; 154 printActionValue(A, OS); 155 OS << ", " << To << ", " << InfoIdx << "},\n"; 156 } 157 OS << "\n}};\n\n"; 158 } 159 160 void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; } 161 162 void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; } 163 164 //===----------------------------------------------------------------------===// 165 // AutomatonEmitter implementation 166 
//===----------------------------------------------------------------------===// 167 168 namespace { 169 // FIXME: This entire discriminated union could be removed with c++17: 170 // using Action = std::variant<Record *, unsigned, std::string>; 171 struct Action { 172 Record *R = nullptr; 173 unsigned I = 0; 174 std::string S = nullptr; 175 176 Action() = default; 177 Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {} 178 179 void print(raw_ostream &OS) const { 180 if (R) 181 OS << R->getName(); 182 else if (!S.empty()) 183 OS << '"' << S << '"'; 184 else 185 OS << I; 186 } 187 bool operator<(const Action &Other) const { 188 return std::make_tuple(R, I, S) < 189 std::make_tuple(Other.R, Other.I, Other.S); 190 } 191 }; 192 193 using ActionTuple = std::vector<Action>; 194 class Automaton; 195 196 class Transition { 197 uint64_t NewState; 198 // The tuple of actions that causes this transition. 199 ActionTuple Actions; 200 // The types of the actions; this is the same across all transitions. 201 SmallVector<std::string, 4> Types; 202 203 public: 204 Transition(Record *R, Automaton *Parent); 205 const ActionTuple &getActions() { return Actions; } 206 SmallVector<std::string, 4> getTypes() { return Types; } 207 208 bool canTransitionFrom(uint64_t State); 209 uint64_t transitionFrom(uint64_t State); 210 }; 211 212 class Automaton { 213 RecordKeeper &Records; 214 Record *R; 215 std::vector<Transition> Transitions; 216 /// All possible action tuples, uniqued. 217 UniqueVector<ActionTuple> Actions; 218 /// The fields within each Transition object to find the action symbols. 219 std::vector<StringRef> ActionSymbolFields; 220 221 public: 222 Automaton(RecordKeeper &Records, Record *R); 223 void emit(raw_ostream &OS); 224 225 ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; } 226 /// If the type of action A has been overridden (there exists a field 227 /// "TypeOf_A") return that, otherwise return the empty string. 
228 StringRef getActionSymbolType(StringRef A); 229 }; 230 231 class AutomatonEmitter { 232 RecordKeeper &Records; 233 234 public: 235 AutomatonEmitter(RecordKeeper &R) : Records(R) {} 236 void run(raw_ostream &OS); 237 }; 238 239 /// A DfaEmitter implementation that can print our variant action type. 240 class CustomDfaEmitter : public DfaEmitter { 241 const UniqueVector<ActionTuple> &Actions; 242 std::string TypeName; 243 244 public: 245 CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName) 246 : Actions(Actions), TypeName(TypeName) {} 247 248 void printActionType(raw_ostream &OS) override; 249 void printActionValue(action_type A, raw_ostream &OS) override; 250 }; 251 } // namespace 252 253 void AutomatonEmitter::run(raw_ostream &OS) { 254 for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) { 255 Automaton A(Records, R); 256 OS << "#ifdef GET_" << R->getName() << "_DECL\n"; 257 A.emit(OS); 258 OS << "#endif // GET_" << R->getName() << "_DECL\n"; 259 } 260 } 261 262 Automaton::Automaton(RecordKeeper &Records, Record *R) 263 : Records(Records), R(R) { 264 LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n"); 265 ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields"); 266 } 267 268 void Automaton::emit(raw_ostream &OS) { 269 StringRef TransitionClass = R->getValueAsString("TransitionClass"); 270 for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) { 271 assert(T->isSubClassOf("Transition")); 272 Transitions.emplace_back(T, this); 273 Actions.insert(Transitions.back().getActions()); 274 } 275 276 LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size() 277 << "\n"); 278 LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size() 279 << " potential transitions.\n"); 280 281 StringRef Name = R->getName(); 282 283 CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action"); 284 // Starting from the initial state, build up a list of possible states and 285 // 
transitions. 286 std::deque<uint64_t> Worklist(1, 0); 287 std::set<uint64_t> SeenStates; 288 unsigned NumTransitions = 0; 289 SeenStates.insert(Worklist.front()); 290 while (!Worklist.empty()) { 291 uint64_t State = Worklist.front(); 292 Worklist.pop_front(); 293 for (Transition &T : Transitions) { 294 if (!T.canTransitionFrom(State)) 295 continue; 296 uint64_t NewState = T.transitionFrom(State); 297 if (SeenStates.emplace(NewState).second) 298 Worklist.emplace_back(NewState); 299 ++NumTransitions; 300 Emitter.addTransition(State, NewState, Actions.idFor(T.getActions())); 301 } 302 } 303 LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() 304 << " states with " << NumTransitions << " transitions.\n"); 305 306 const auto &ActionTypes = Transitions.back().getTypes(); 307 OS << "// The type of an action in the " << Name << " automaton.\n"; 308 if (ActionTypes.size() == 1) { 309 OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n"; 310 } else { 311 OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ") 312 << ">;\n"; 313 } 314 OS << "\n"; 315 316 Emitter.emit(Name, OS); 317 } 318 319 StringRef Automaton::getActionSymbolType(StringRef A) { 320 Twine Ty = "TypeOf_" + A; 321 if (!R->getValue(Ty.str())) 322 return ""; 323 return R->getValueAsString(Ty.str()); 324 } 325 326 Transition::Transition(Record *R, Automaton *Parent) { 327 BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); 328 NewState = 0; 329 assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 && 330 "State cannot be represented in 64 bits!"); 331 for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) { 332 if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) { 333 if (Bit->getValue()) 334 NewState |= 1ULL << I; 335 } 336 } 337 338 for (StringRef A : Parent->getActionSymbolFields()) { 339 RecordVal *SymbolV = R->getValue(A); 340 if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) { 341 Actions.emplace_back(R->getValueAsDef(A), 0, ""); 342 
Types.emplace_back(Ty->getAsString()); 343 } else if (isa<IntRecTy>(SymbolV->getType())) { 344 Actions.emplace_back(nullptr, R->getValueAsInt(A), ""); 345 Types.emplace_back("unsigned"); 346 } else if (isa<StringRecTy>(SymbolV->getType()) || 347 isa<CodeRecTy>(SymbolV->getType())) { 348 Actions.emplace_back(nullptr, 0, R->getValueAsString(A)); 349 Types.emplace_back("std::string"); 350 } else { 351 report_fatal_error("Unhandled symbol type!"); 352 } 353 354 StringRef TypeOverride = Parent->getActionSymbolType(A); 355 if (!TypeOverride.empty()) 356 Types.back() = TypeOverride; 357 } 358 } 359 360 bool Transition::canTransitionFrom(uint64_t State) { 361 if ((State & NewState) == 0) 362 // The bits we want to set are not set; 363 return true; 364 return false; 365 } 366 367 uint64_t Transition::transitionFrom(uint64_t State) { 368 return State | NewState; 369 } 370 371 void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } 372 373 void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { 374 const ActionTuple &AT = Actions[A]; 375 if (AT.size() > 1) 376 OS << "std::make_tuple("; 377 bool First = true; 378 for (const auto &SingleAction : AT) { 379 if (!First) 380 OS << ", "; 381 First = false; 382 SingleAction.print(OS); 383 } 384 if (AT.size() > 1) 385 OS << ")"; 386 } 387 388 namespace llvm { 389 390 void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) { 391 AutomatonEmitter(RK).run(OS); 392 } 393 394 } // namespace llvm 395