1 // Scintilla source code edit control 2 /** @file WordList.cxx 3 ** Hold a list of words. 4 **/ 5 // Copyright 1998-2002 by Neil Hodgson <[email protected]> 6 // The License.txt file describes the conditions under which this software may be distributed. 7 8 #include <cstdlib> 9 #include <cassert> 10 #include <cstring> 11 12 #include <algorithm> 13 #include <iterator> 14 15 #include "WordList.h" 16 17 using namespace Scintilla; 18 19 /** 20 * Creates an array that points into each word in the string and puts \0 terminators 21 * after each word. 22 */ 23 static char **ArrayFromWordList(char *wordlist, size_t slen, int *len, bool onlyLineEnds = false) { 24 int prev = '\n'; 25 int words = 0; 26 // For rapid determination of whether a character is a separator, build 27 // a look up table. 28 bool wordSeparator[256] = {}; // Initialise all to false. 29 wordSeparator[static_cast<unsigned int>('\r')] = true; 30 wordSeparator[static_cast<unsigned int>('\n')] = true; 31 if (!onlyLineEnds) { 32 wordSeparator[static_cast<unsigned int>(' ')] = true; 33 wordSeparator[static_cast<unsigned int>('\t')] = true; 34 } 35 for (int j = 0; wordlist[j]; j++) { 36 const int curr = static_cast<unsigned char>(wordlist[j]); 37 if (!wordSeparator[curr] && wordSeparator[prev]) 38 words++; 39 prev = curr; 40 } 41 char **keywords = new char *[words + 1]; 42 int wordsStore = 0; 43 if (words) { 44 prev = '\0'; 45 for (size_t k = 0; k < slen; k++) { 46 if (!wordSeparator[static_cast<unsigned char>(wordlist[k])]) { 47 if (!prev) { 48 keywords[wordsStore] = &wordlist[k]; 49 wordsStore++; 50 } 51 } else { 52 wordlist[k] = '\0'; 53 } 54 prev = wordlist[k]; 55 } 56 } 57 assert(wordsStore < (words + 1)); 58 keywords[wordsStore] = &wordlist[slen]; 59 *len = wordsStore; 60 return keywords; 61 } 62 63 WordList::WordList(bool onlyLineEnds_) : 64 words(0), list(0), len(0), onlyLineEnds(onlyLineEnds_) { 65 // Prevent warnings by static analyzers about uninitialized starts. 66 starts[0] = -1; 67 } 68 69 WordList::~WordList() { 70 Clear(); 71 } 72 73 WordList::operator bool() const noexcept { 74 return len ? true : false; 75 } 76 77 bool WordList::operator!=(const WordList &other) const noexcept { 78 if (len != other.len) 79 return true; 80 for (int i=0; i<len; i++) { 81 if (strcmp(words[i], other.words[i]) != 0) 82 return true; 83 } 84 return false; 85 } 86 87 int WordList::Length() const noexcept { 88 return len; 89 } 90 91 void WordList::Clear() noexcept { 92 if (words) { 93 delete []list; 94 delete []words; 95 } 96 words = nullptr; 97 list = nullptr; 98 len = 0; 99 } 100 101 #ifdef _MSC_VER 102 103 static bool cmpWords(const char *a, const char *b) { 104 return strcmp(a, b) < 0; 105 } 106 107 #else 108 109 static int cmpWords(const void *a, const void *b) { 110 return strcmp(*static_cast<const char * const *>(a), *static_cast<const char * const *>(b)); 111 } 112 113 static void SortWordList(char **words, unsigned int len) { 114 qsort(words, len, sizeof(*words), cmpWords); 115 } 116 117 #endif 118 119 bool WordList::Set(const char *s) { 120 const size_t lenS = strlen(s) + 1; 121 char *listTemp = new char[lenS]; 122 memcpy(listTemp, s, lenS); 123 int lenTemp = 0; 124 char **wordsTemp = ArrayFromWordList(listTemp, lenS - 1, &lenTemp, onlyLineEnds); 125 #ifdef _MSC_VER 126 std::sort(wordsTemp, wordsTemp + lenTemp, cmpWords); 127 #else 128 SortWordList(wordsTemp, lenTemp); 129 #endif 130 131 if (lenTemp == len) { 132 bool changed = false; 133 for (int i = 0; i < lenTemp; i++) { 134 if (strcmp(words[i], wordsTemp[i]) != 0) { 135 changed = true; 136 break; 137 } 138 } 139 if (!changed) { 140 delete []listTemp; 141 delete []wordsTemp; 142 return false; 143 } 144 } 145 146 Clear(); 147 words = wordsTemp; 148 list = listTemp; 149 len = lenTemp; 150 std::fill(starts, std::end(starts), -1); 151 for (int l = len - 1; l >= 0; l--) { 152 unsigned char indexChar = words[l][0]; 153 starts[indexChar] = l; 154 } 155 return true; 156 } 157 158 /** Check whether a string is in the list. 159 * List elements are either exact matches or prefixes. 160 * Prefix elements start with '^' and match all strings that start with the rest of the element 161 * so '^GTK_' matches 'GTK_X', 'GTK_MAJOR_VERSION', and 'GTK_'. 162 */ 163 bool WordList::InList(const char *s) const noexcept { 164 if (0 == words) 165 return false; 166 const unsigned char firstChar = s[0]; 167 int j = starts[firstChar]; 168 if (j >= 0) { 169 while (words[j][0] == firstChar) { 170 if (s[1] == words[j][1]) { 171 const char *a = words[j] + 1; 172 const char *b = s + 1; 173 while (*a && *a == *b) { 174 a++; 175 b++; 176 } 177 if (!*a && !*b) 178 return true; 179 } 180 j++; 181 } 182 } 183 j = starts[static_cast<unsigned int>('^')]; 184 if (j >= 0) { 185 while (words[j][0] == '^') { 186 const char *a = words[j] + 1; 187 const char *b = s; 188 while (*a && *a == *b) { 189 a++; 190 b++; 191 } 192 if (!*a) 193 return true; 194 j++; 195 } 196 } 197 return false; 198 } 199 200 /** similar to InList, but word s can be a substring of keyword. 201 * eg. the keyword define is defined as def~ine. This means the word must start 202 * with def to be a keyword, but also defi, defin and define are valid. 203 * The marker is ~ in this case. 204 */ 205 bool WordList::InListAbbreviated(const char *s, const char marker) const noexcept { 206 if (0 == words) 207 return false; 208 const unsigned char firstChar = s[0]; 209 int j = starts[firstChar]; 210 if (j >= 0) { 211 while (words[j][0] == firstChar) { 212 bool isSubword = false; 213 int start = 1; 214 if (words[j][1] == marker) { 215 isSubword = true; 216 start++; 217 } 218 if (s[1] == words[j][start]) { 219 const char *a = words[j] + start; 220 const char *b = s + 1; 221 while (*a && *a == *b) { 222 a++; 223 if (*a == marker) { 224 isSubword = true; 225 a++; 226 } 227 b++; 228 } 229 if ((!*a || isSubword) && !*b) 230 return true; 231 } 232 j++; 233 } 234 } 235 j = starts[static_cast<unsigned int>('^')]; 236 if (j >= 0) { 237 while (words[j][0] == '^') { 238 const char *a = words[j] + 1; 239 const char *b = s; 240 while (*a && *a == *b) { 241 a++; 242 b++; 243 } 244 if (!*a) 245 return true; 246 j++; 247 } 248 } 249 return false; 250 } 251 252 /** similar to InListAbbreviated, but word s can be a abridged version of a keyword. 253 * eg. the keyword is defined as "after.~:". This means the word must have a prefix (begins with) of 254 * "after." and suffix (ends with) of ":" to be a keyword, Hence "after.field:" , "after.form.item:" are valid. 255 * Similarly "~.is.valid" keyword is suffix only... hence "field.is.valid" , "form.is.valid" are valid. 256 * The marker is ~ in this case. 257 * No multiple markers check is done and wont work. 258 */ 259 bool WordList::InListAbridged(const char *s, const char marker) const noexcept { 260 if (0 == words) 261 return false; 262 const unsigned char firstChar = s[0]; 263 int j = starts[firstChar]; 264 if (j >= 0) { 265 while (words[j][0] == firstChar) { 266 const char *a = words[j]; 267 const char *b = s; 268 while (*a && *a == *b) { 269 a++; 270 if (*a == marker) { 271 a++; 272 const size_t suffixLengthA = strlen(a); 273 const size_t suffixLengthB = strlen(b); 274 if (suffixLengthA >= suffixLengthB) 275 break; 276 b = b + suffixLengthB - suffixLengthA - 1; 277 } 278 b++; 279 } 280 if (!*a && !*b) 281 return true; 282 j++; 283 } 284 } 285 286 j = starts[static_cast<unsigned int>(marker)]; 287 if (j >= 0) { 288 while (words[j][0] == marker) { 289 const char *a = words[j] + 1; 290 const char *b = s; 291 const size_t suffixLengthA = strlen(a); 292 const size_t suffixLengthB = strlen(b); 293 if (suffixLengthA > suffixLengthB) { 294 j++; 295 continue; 296 } 297 b = b + suffixLengthB - suffixLengthA; 298 299 while (*a && *a == *b) { 300 a++; 301 b++; 302 } 303 if (!*a && !*b) 304 return true; 305 j++; 306 } 307 } 308 309 return false; 310 } 311 312 const char *WordList::WordAt(int n) const noexcept { 313 return words[n]; 314 } 315 316