1 //===- DFAEmitter.cpp - Finite state automaton emitter --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class can produce a generic deterministic finite state automaton (DFA), 10 // given a set of possible states and transitions. 11 // 12 // The input transitions can be nondeterministic - this class will produce the 13 // deterministic equivalent state machine. 14 // 15 // The generated code can run the DFA and produce an accepted / not accepted 16 // state and also produce, given a sequence of transitions that results in an 17 // accepted state, the sequence of intermediate states. This is useful if the 18 // initial automaton was nondeterministic - it allows mapping back from the DFA 19 // to the NFA. 20 // 21 //===----------------------------------------------------------------------===// 22 23 #include "DFAEmitter.h" 24 #include "SequenceToOffsetTable.h" 25 #include "llvm/ADT/SmallVector.h" 26 #include "llvm/ADT/StringExtras.h" 27 #include "llvm/ADT/UniqueVector.h" 28 #include "llvm/Support/Debug.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include "llvm/TableGen/Record.h" 31 #include "llvm/TableGen/TableGenBackend.h" 32 #include <cassert> 33 #include <cstdint> 34 #include <deque> 35 #include <map> 36 #include <set> 37 #include <string> 38 #include <variant> 39 #include <vector> 40 41 #define DEBUG_TYPE "dfa-emitter" 42 43 using namespace llvm; 44 45 //===----------------------------------------------------------------------===// 46 // DfaEmitter implementation. This is independent of the GenAutomaton backend. 47 //===----------------------------------------------------------------------===// 48 49 void DfaEmitter::addTransition(state_type From, state_type To, action_type A) { 50 Actions.insert(A); 51 NfaStates.insert(From); 52 NfaStates.insert(To); 53 NfaTransitions[{From, A}].push_back(To); 54 ++NumNfaTransitions; 55 } 56 57 void DfaEmitter::visitDfaState(const DfaState &DS) { 58 // For every possible action... 59 auto FromId = DfaStates.idFor(DS); 60 for (action_type A : Actions) { 61 DfaState NewStates; 62 DfaTransitionInfo TI; 63 // For every represented state, word pair in the original NFA... 64 for (state_type FromState : DS) { 65 // If this action is possible from this state add the transitioned-to 66 // states to NewStates. 67 auto I = NfaTransitions.find({FromState, A}); 68 if (I == NfaTransitions.end()) 69 continue; 70 for (state_type &ToState : I->second) { 71 NewStates.push_back(ToState); 72 TI.emplace_back(FromState, ToState); 73 } 74 } 75 if (NewStates.empty()) 76 continue; 77 // Sort and unique. 78 sort(NewStates); 79 NewStates.erase(std::unique(NewStates.begin(), NewStates.end()), 80 NewStates.end()); 81 sort(TI); 82 TI.erase(std::unique(TI.begin(), TI.end()), TI.end()); 83 unsigned ToId = DfaStates.insert(NewStates); 84 DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI)); 85 } 86 } 87 88 void DfaEmitter::constructDfa() { 89 DfaState Initial(1, /*NFA initial state=*/0); 90 DfaStates.insert(Initial); 91 92 // Note that UniqueVector starts indices at 1, not zero. 93 unsigned DfaStateId = 1; 94 while (DfaStateId <= DfaStates.size()) { 95 DfaState S = DfaStates[DfaStateId]; 96 visitDfaState(S); 97 DfaStateId++; 98 } 99 } 100 101 void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { 102 constructDfa(); 103 104 OS << "// Input NFA has " << NfaStates.size() << " states with " 105 << NumNfaTransitions << " transitions.\n"; 106 OS << "// Generated DFA has " << DfaStates.size() << " states with " 107 << DfaTransitions.size() << " transitions.\n\n"; 108 109 // Implementation note: We don't bake a simple std::pair<> here as it requires 110 // significantly more effort to parse. A simple test with a large array of 111 // struct-pairs (N=100000) took clang-10 6s to parse. The same array of 112 // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to 113 // define the pair type. 114 // 115 // FIXME: It may make sense to emit these as ULEB sequences instead of 116 // pairs of uint64_t. 117 OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n"; 118 OS << "// transition implies a set of NFA transitions. These are referred\n"; 119 OS << "// to by index in " << Name << "Transitions[].\n"; 120 121 SequenceToOffsetTable<DfaTransitionInfo> Table; 122 std::map<DfaTransitionInfo, unsigned> EmittedIndices; 123 for (auto &T : DfaTransitions) 124 Table.add(T.second.second); 125 Table.layout(); 126 OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name 127 << "TransitionInfo = {{\n"; 128 Table.emit( 129 OS, 130 [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) { 131 OS << "{" << P.first << ", " << P.second << "}"; 132 }, 133 "{0ULL, 0ULL}"); 134 135 OS << "}};\n\n"; 136 137 OS << "// A transition in the generated " << Name << " DFA.\n"; 138 OS << "struct " << Name << "Transition {\n"; 139 OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n"; 140 OS << " "; 141 printActionType(OS); 142 OS << " Action; // The input symbol that causes this transition.\n"; 143 OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n"; 144 OS << " unsigned InfoIdx; // Start index into " << Name 145 << "TransitionInfo.\n"; 146 OS << "};\n\n"; 147 148 OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; 149 OS << "// The initial state is 1, not zero.\n"; 150 OS << "const std::array<" << Name << "Transition, " 151 << DfaTransitions.size() << "> " << Name << "Transitions = {{\n"; 152 for (auto &KV : DfaTransitions) { 153 dfa_state_type From = KV.first.first; 154 dfa_state_type To = KV.second.first; 155 action_type A = KV.first.second; 156 unsigned InfoIdx = Table.get(KV.second.second); 157 OS << " {" << From << ", "; 158 printActionValue(A, OS); 159 OS << ", " << To << ", " << InfoIdx << "},\n"; 160 } 161 OS << "\n}};\n\n"; 162 } 163 164 void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; } 165 166 void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; } 167 168 //===----------------------------------------------------------------------===// 169 // AutomatonEmitter implementation 170 //===----------------------------------------------------------------------===// 171 172 namespace { 173 174 using Action = std::variant<Record *, unsigned, std::string>; 175 using ActionTuple = std::vector<Action>; 176 class Automaton; 177 178 class Transition { 179 uint64_t NewState; 180 // The tuple of actions that causes this transition. 181 ActionTuple Actions; 182 // The types of the actions; this is the same across all transitions. 183 SmallVector<std::string, 4> Types; 184 185 public: 186 Transition(Record *R, Automaton *Parent); 187 const ActionTuple &getActions() { return Actions; } 188 SmallVector<std::string, 4> getTypes() { return Types; } 189 190 bool canTransitionFrom(uint64_t State); 191 uint64_t transitionFrom(uint64_t State); 192 }; 193 194 class Automaton { 195 RecordKeeper &Records; 196 Record *R; 197 std::vector<Transition> Transitions; 198 /// All possible action tuples, uniqued. 199 UniqueVector<ActionTuple> Actions; 200 /// The fields within each Transition object to find the action symbols. 201 std::vector<StringRef> ActionSymbolFields; 202 203 public: 204 Automaton(RecordKeeper &Records, Record *R); 205 void emit(raw_ostream &OS); 206 207 ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; } 208 /// If the type of action A has been overridden (there exists a field 209 /// "TypeOf_A") return that, otherwise return the empty string. 210 StringRef getActionSymbolType(StringRef A); 211 }; 212 213 class AutomatonEmitter { 214 RecordKeeper &Records; 215 216 public: 217 AutomatonEmitter(RecordKeeper &R) : Records(R) {} 218 void run(raw_ostream &OS); 219 }; 220 221 /// A DfaEmitter implementation that can print our variant action type. 222 class CustomDfaEmitter : public DfaEmitter { 223 const UniqueVector<ActionTuple> &Actions; 224 std::string TypeName; 225 226 public: 227 CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName) 228 : Actions(Actions), TypeName(TypeName) {} 229 230 void printActionType(raw_ostream &OS) override; 231 void printActionValue(action_type A, raw_ostream &OS) override; 232 }; 233 } // namespace 234 235 void AutomatonEmitter::run(raw_ostream &OS) { 236 for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) { 237 Automaton A(Records, R); 238 OS << "#ifdef GET_" << R->getName() << "_DECL\n"; 239 A.emit(OS); 240 OS << "#endif // GET_" << R->getName() << "_DECL\n"; 241 } 242 } 243 244 Automaton::Automaton(RecordKeeper &Records, Record *R) 245 : Records(Records), R(R) { 246 LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n"); 247 ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields"); 248 } 249 250 void Automaton::emit(raw_ostream &OS) { 251 StringRef TransitionClass = R->getValueAsString("TransitionClass"); 252 for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) { 253 assert(T->isSubClassOf("Transition")); 254 Transitions.emplace_back(T, this); 255 Actions.insert(Transitions.back().getActions()); 256 } 257 258 LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size() 259 << "\n"); 260 LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size() 261 << " potential transitions.\n"); 262 263 StringRef Name = R->getName(); 264 265 CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action"); 266 // Starting from the initial state, build up a list of possible states and 267 // transitions. 268 std::deque<uint64_t> Worklist(1, 0); 269 std::set<uint64_t> SeenStates; 270 unsigned NumTransitions = 0; 271 SeenStates.insert(Worklist.front()); 272 while (!Worklist.empty()) { 273 uint64_t State = Worklist.front(); 274 Worklist.pop_front(); 275 for (Transition &T : Transitions) { 276 if (!T.canTransitionFrom(State)) 277 continue; 278 uint64_t NewState = T.transitionFrom(State); 279 if (SeenStates.emplace(NewState).second) 280 Worklist.emplace_back(NewState); 281 ++NumTransitions; 282 Emitter.addTransition(State, NewState, Actions.idFor(T.getActions())); 283 } 284 } 285 LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() 286 << " states with " << NumTransitions << " transitions.\n"); 287 (void) NumTransitions; 288 289 const auto &ActionTypes = Transitions.back().getTypes(); 290 OS << "// The type of an action in the " << Name << " automaton.\n"; 291 if (ActionTypes.size() == 1) { 292 OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n"; 293 } else { 294 OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ") 295 << ">;\n"; 296 } 297 OS << "\n"; 298 299 Emitter.emit(Name, OS); 300 } 301 302 StringRef Automaton::getActionSymbolType(StringRef A) { 303 Twine Ty = "TypeOf_" + A; 304 if (!R->getValue(Ty.str())) 305 return ""; 306 return R->getValueAsString(Ty.str()); 307 } 308 309 Transition::Transition(Record *R, Automaton *Parent) { 310 BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); 311 NewState = 0; 312 assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 && 313 "State cannot be represented in 64 bits!"); 314 for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) { 315 if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) { 316 if (Bit->getValue()) 317 NewState |= 1ULL << I; 318 } 319 } 320 321 for (StringRef A : Parent->getActionSymbolFields()) { 322 RecordVal *SymbolV = R->getValue(A); 323 if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) { 324 Actions.emplace_back(R->getValueAsDef(A)); 325 Types.emplace_back(Ty->getAsString()); 326 } else if (isa<IntRecTy>(SymbolV->getType())) { 327 Actions.emplace_back(static_cast<unsigned>(R->getValueAsInt(A))); 328 Types.emplace_back("unsigned"); 329 } else if (isa<StringRecTy>(SymbolV->getType())) { 330 Actions.emplace_back(std::string(R->getValueAsString(A))); 331 Types.emplace_back("std::string"); 332 } else { 333 report_fatal_error("Unhandled symbol type!"); 334 } 335 336 StringRef TypeOverride = Parent->getActionSymbolType(A); 337 if (!TypeOverride.empty()) 338 Types.back() = std::string(TypeOverride); 339 } 340 } 341 342 bool Transition::canTransitionFrom(uint64_t State) { 343 if ((State & NewState) == 0) 344 // The bits we want to set are not set; 345 return true; 346 return false; 347 } 348 349 uint64_t Transition::transitionFrom(uint64_t State) { 350 return State | NewState; 351 } 352 353 void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } 354 355 void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { 356 const ActionTuple &AT = Actions[A]; 357 if (AT.size() > 1) 358 OS << "std::make_tuple("; 359 ListSeparator LS; 360 for (const auto &SingleAction : AT) { 361 OS << LS; 362 if (const auto *R = std::get_if<Record *>(&SingleAction)) 363 OS << (*R)->getName(); 364 else if (const auto *S = std::get_if<std::string>(&SingleAction)) 365 OS << '"' << *S << '"'; 366 else 367 OS << std::get<unsigned>(SingleAction); 368 } 369 if (AT.size() > 1) 370 OS << ")"; 371 } 372 373 static TableGen::Emitter::OptClass<AutomatonEmitter> 374 X("gen-automata", "Generate generic automata"); 375