1*0b57cec5SDimitry Andric //===-- Support/DJB.cpp ---DJB Hash -----------------------------*- C++ -*-===// 2*0b57cec5SDimitry Andric // 3*0b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*0b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5*0b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*0b57cec5SDimitry Andric // 7*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 8*0b57cec5SDimitry Andric // 9*0b57cec5SDimitry Andric // This file contains support for the DJ Bernstein hash function. 10*0b57cec5SDimitry Andric // 11*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 12*0b57cec5SDimitry Andric 13*0b57cec5SDimitry Andric #include "llvm/Support/DJB.h" 14*0b57cec5SDimitry Andric #include "llvm/ADT/ArrayRef.h" 15*0b57cec5SDimitry Andric #include "llvm/Support/Compiler.h" 16*0b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h" 17*0b57cec5SDimitry Andric #include "llvm/Support/Unicode.h" 18*0b57cec5SDimitry Andric 19*0b57cec5SDimitry Andric using namespace llvm; 20*0b57cec5SDimitry Andric 21*0b57cec5SDimitry Andric static UTF32 chopOneUTF32(StringRef &Buffer) { 22*0b57cec5SDimitry Andric UTF32 C; 23*0b57cec5SDimitry Andric const UTF8 *const Begin8Const = 24*0b57cec5SDimitry Andric reinterpret_cast<const UTF8 *>(Buffer.begin()); 25*0b57cec5SDimitry Andric const UTF8 *Begin8 = Begin8Const; 26*0b57cec5SDimitry Andric UTF32 *Begin32 = &C; 27*0b57cec5SDimitry Andric 28*0b57cec5SDimitry Andric // In lenient mode we will always end up with a "reasonable" value in C for 29*0b57cec5SDimitry Andric // non-empty input. 30*0b57cec5SDimitry Andric assert(!Buffer.empty()); 31*0b57cec5SDimitry Andric ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()), 32*0b57cec5SDimitry Andric &Begin32, &C + 1, lenientConversion); 33*0b57cec5SDimitry Andric Buffer = Buffer.drop_front(Begin8 - Begin8Const); 34*0b57cec5SDimitry Andric return C; 35*0b57cec5SDimitry Andric } 36*0b57cec5SDimitry Andric 37*0b57cec5SDimitry Andric static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) { 38*0b57cec5SDimitry Andric const UTF32 *Begin32 = &C; 39*0b57cec5SDimitry Andric UTF8 *Begin8 = Storage.begin(); 40*0b57cec5SDimitry Andric 41*0b57cec5SDimitry Andric // The case-folded output should always be a valid unicode character, so use 42*0b57cec5SDimitry Andric // strict mode here. 43*0b57cec5SDimitry Andric ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8, 44*0b57cec5SDimitry Andric Storage.end(), strictConversion); 45*0b57cec5SDimitry Andric assert(CR == conversionOK && "Case folding produced invalid char?"); 46*0b57cec5SDimitry Andric (void)CR; 47*0b57cec5SDimitry Andric return StringRef(reinterpret_cast<char *>(Storage.begin()), 48*0b57cec5SDimitry Andric Begin8 - Storage.begin()); 49*0b57cec5SDimitry Andric } 50*0b57cec5SDimitry Andric 51*0b57cec5SDimitry Andric static UTF32 foldCharDwarf(UTF32 C) { 52*0b57cec5SDimitry Andric // DWARF v5 addition to the unicode folding rules. 53*0b57cec5SDimitry Andric // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot 54*0b57cec5SDimitry Andric // Above" into "i". 55*0b57cec5SDimitry Andric if (C == 0x130 || C == 0x131) 56*0b57cec5SDimitry Andric return 'i'; 57*0b57cec5SDimitry Andric return sys::unicode::foldCharSimple(C); 58*0b57cec5SDimitry Andric } 59*0b57cec5SDimitry Andric 60*0b57cec5SDimitry Andric static Optional<uint32_t> fastCaseFoldingDjbHash(StringRef Buffer, uint32_t H) { 61*0b57cec5SDimitry Andric bool AllASCII = true; 62*0b57cec5SDimitry Andric for (unsigned char C : Buffer) { 63*0b57cec5SDimitry Andric H = H * 33 + ('A' <= C && C <= 'Z' ? C - 'A' + 'a' : C); 64*0b57cec5SDimitry Andric AllASCII &= C <= 0x7f; 65*0b57cec5SDimitry Andric } 66*0b57cec5SDimitry Andric if (AllASCII) 67*0b57cec5SDimitry Andric return H; 68*0b57cec5SDimitry Andric return None; 69*0b57cec5SDimitry Andric } 70*0b57cec5SDimitry Andric 71*0b57cec5SDimitry Andric uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) { 72*0b57cec5SDimitry Andric if (Optional<uint32_t> Result = fastCaseFoldingDjbHash(Buffer, H)) 73*0b57cec5SDimitry Andric return *Result; 74*0b57cec5SDimitry Andric 75*0b57cec5SDimitry Andric std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage; 76*0b57cec5SDimitry Andric while (!Buffer.empty()) { 77*0b57cec5SDimitry Andric UTF32 C = foldCharDwarf(chopOneUTF32(Buffer)); 78*0b57cec5SDimitry Andric StringRef Folded = toUTF8(C, Storage); 79*0b57cec5SDimitry Andric H = djbHash(Folded, H); 80*0b57cec5SDimitry Andric } 81*0b57cec5SDimitry Andric return H; 82*0b57cec5SDimitry Andric } 83