//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of a formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
26 // 27 //===----------------------------------------------------------------------===// 28 29 #include "ScriptLexer.h" 30 #include "Config.h" 31 #include "llvm/ADT/Twine.h" 32 #include "llvm/Support/ErrorHandling.h" 33 #include "llvm/Support/FileSystem.h" 34 #include "llvm/Support/Path.h" 35 36 using namespace llvm; 37 using namespace lld; 38 using namespace lld::elf; 39 40 ScriptLexer::Buffer::Buffer(Ctx &ctx, MemoryBufferRef mb) 41 : s(mb.getBuffer()), filename(mb.getBufferIdentifier()), 42 begin(mb.getBufferStart()) { 43 if (ctx.arg.sysroot == "") 44 return; 45 StringRef path = filename; 46 for (; !path.empty(); path = sys::path::parent_path(path)) { 47 if (!sys::fs::equivalent(ctx.arg.sysroot, path)) 48 continue; 49 isUnderSysroot = true; 50 return; 51 } 52 } 53 54 ScriptLexer::ScriptLexer(Ctx &ctx, MemoryBufferRef mb) 55 : ctx(ctx), curBuf(ctx, mb), mbs(1, mb) { 56 activeFilenames.insert(mb.getBufferIdentifier()); 57 } 58 59 // Returns a whole line containing the current token. 60 StringRef ScriptLexer::getLine() { 61 StringRef s = getCurrentMB().getBuffer(); 62 63 size_t pos = s.rfind('\n', prevTok.data() - s.data()); 64 if (pos != StringRef::npos) 65 s = s.substr(pos + 1); 66 return s.substr(0, s.find_first_of("\r\n")); 67 } 68 69 // Returns 0-based column number of the current token. 70 size_t ScriptLexer::getColumnNumber() { 71 return prevTok.data() - getLine().data(); 72 } 73 74 std::string ScriptLexer::getCurrentLocation() { 75 std::string filename = std::string(getCurrentMB().getBufferIdentifier()); 76 return (filename + ":" + Twine(prevTokLine)).str(); 77 } 78 79 // We don't want to record cascading errors. Keep only the first one. 
80 void ScriptLexer::setError(const Twine &msg) { 81 if (errCount(ctx)) 82 return; 83 84 std::string s = (getCurrentLocation() + ": " + msg).str(); 85 if (prevTok.size()) 86 s += "\n>>> " + getLine().str() + "\n>>> " + 87 std::string(getColumnNumber(), ' ') + "^"; 88 ErrAlways(ctx) << s; 89 } 90 91 void ScriptLexer::lex() { 92 for (;;) { 93 StringRef &s = curBuf.s; 94 s = skipSpace(s); 95 if (s.empty()) { 96 // If this buffer is from an INCLUDE command, switch to the "return 97 // value"; otherwise, mark EOF. 98 if (buffers.empty()) { 99 eof = true; 100 return; 101 } 102 activeFilenames.erase(curBuf.filename); 103 curBuf = buffers.pop_back_val(); 104 continue; 105 } 106 curTokState = lexState; 107 108 // Quoted token. Note that double-quote characters are parts of a token 109 // because, in a glob match context, only unquoted tokens are interpreted 110 // as glob patterns. Double-quoted tokens are literal patterns in that 111 // context. 112 if (s.starts_with("\"")) { 113 size_t e = s.find("\"", 1); 114 if (e == StringRef::npos) { 115 size_t lineno = 116 StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n'); 117 ErrAlways(ctx) << curBuf.filename << ":" << (lineno + 1) 118 << ": unclosed quote"; 119 return; 120 } 121 122 curTok = s.take_front(e + 1); 123 s = s.substr(e + 1); 124 return; 125 } 126 127 // Some operators form separate tokens. 128 if (s.starts_with("<<=") || s.starts_with(">>=")) { 129 curTok = s.substr(0, 3); 130 s = s.substr(3); 131 return; 132 } 133 if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) { 134 curTok = s.substr(0, 2); 135 s = s.substr(2); 136 return; 137 } 138 139 // Unquoted token. The non-expression token is more relaxed than tokens in 140 // C-like languages, so that you can write "file-name.cpp" as one bare 141 // token. 
142 size_t pos; 143 switch (lexState) { 144 case State::Script: 145 pos = s.find_first_not_of( 146 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 147 "0123456789_.$/\\~=+[]*?-!^:"); 148 break; 149 case State::Expr: 150 pos = s.find_first_not_of( 151 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 152 "0123456789_.$"); 153 if (pos == 0 && s.size() >= 2 && 154 ((s[0] == s[1] && strchr("<>&|", s[0])) || 155 is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2)))) 156 pos = 2; 157 break; 158 } 159 160 if (pos == 0) 161 pos = 1; 162 curTok = s.substr(0, pos); 163 s = s.substr(pos); 164 break; 165 } 166 } 167 168 // Skip leading whitespace characters or comments. 169 StringRef ScriptLexer::skipSpace(StringRef s) { 170 for (;;) { 171 if (s.starts_with("/*")) { 172 size_t e = s.find("*/", 2); 173 if (e == StringRef::npos) { 174 setError("unclosed comment in a linker script"); 175 return ""; 176 } 177 curBuf.lineNumber += s.substr(0, e).count('\n'); 178 s = s.substr(e + 2); 179 continue; 180 } 181 if (s.starts_with("#")) { 182 size_t e = s.find('\n', 1); 183 if (e == StringRef::npos) 184 e = s.size() - 1; 185 else 186 ++curBuf.lineNumber; 187 s = s.substr(e + 1); 188 continue; 189 } 190 StringRef saved = s; 191 s = s.ltrim(); 192 auto len = saved.size() - s.size(); 193 if (len == 0) 194 return s; 195 curBuf.lineNumber += saved.substr(0, len).count('\n'); 196 } 197 } 198 199 // Used to determine whether to stop parsing. Treat errors like EOF. 200 bool ScriptLexer::atEOF() { return eof || errCount(ctx); } 201 202 StringRef ScriptLexer::next() { 203 prevTok = peek(); 204 // `prevTokLine` is not updated for EOF so that the line number in `setError` 205 // will be more useful. 206 if (prevTok.size()) 207 prevTokLine = curBuf.lineNumber; 208 return std::exchange(curTok, StringRef(curBuf.s.data(), 0)); 209 } 210 211 StringRef ScriptLexer::peek() { 212 // curTok is invalid if curTokState and lexState mismatch. 
213 if (curTok.size() && curTokState != lexState) { 214 curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data()); 215 curTok = {}; 216 } 217 if (curTok.empty()) 218 lex(); 219 return curTok; 220 } 221 222 bool ScriptLexer::consume(StringRef tok) { 223 if (peek() != tok) 224 return false; 225 next(); 226 return true; 227 } 228 229 void ScriptLexer::skip() { (void)next(); } 230 231 void ScriptLexer::expect(StringRef expect) { 232 if (errCount(ctx)) 233 return; 234 StringRef tok = next(); 235 if (tok != expect) { 236 if (atEOF()) 237 setError("unexpected EOF"); 238 else 239 setError(expect + " expected, but got " + tok); 240 } 241 } 242 243 ScriptLexer::Token ScriptLexer::till(StringRef tok) { 244 StringRef str = next(); 245 if (str == tok) 246 return {}; 247 if (!atEOF()) 248 return {str}; 249 prevTok = {}; 250 setError("unexpected EOF"); 251 return {}; 252 } 253 254 // Returns true if S encloses T. 255 static bool encloses(StringRef s, StringRef t) { 256 return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end(); 257 } 258 259 MemoryBufferRef ScriptLexer::getCurrentMB() { 260 // Find input buffer containing the current token. 261 assert(!mbs.empty()); 262 for (MemoryBufferRef mb : mbs) 263 if (encloses(mb.getBuffer(), curBuf.s)) 264 return mb; 265 llvm_unreachable("getCurrentMB: failed to find a token"); 266 } 267