xref: /MusicPlayer2/scintilla/lexlib/WordList.cxx (revision 8af74909132ed5e696cb05b6689ae4baf14c1c96)
1*8af74909SZhong Yang // Scintilla source code edit control
2*8af74909SZhong Yang /** @file WordList.cxx
3*8af74909SZhong Yang  ** Hold a list of words.
4*8af74909SZhong Yang  **/
5*8af74909SZhong Yang // Copyright 1998-2002 by Neil Hodgson <[email protected]>
6*8af74909SZhong Yang // The License.txt file describes the conditions under which this software may be distributed.
7*8af74909SZhong Yang 
8*8af74909SZhong Yang #include <cstdlib>
9*8af74909SZhong Yang #include <cassert>
10*8af74909SZhong Yang #include <cstring>
11*8af74909SZhong Yang 
12*8af74909SZhong Yang #include <algorithm>
13*8af74909SZhong Yang #include <iterator>
14*8af74909SZhong Yang 
15*8af74909SZhong Yang #include "WordList.h"
16*8af74909SZhong Yang 
17*8af74909SZhong Yang using namespace Scintilla;
18*8af74909SZhong Yang 
19*8af74909SZhong Yang /**
20*8af74909SZhong Yang  * Creates an array that points into each word in the string and puts \0 terminators
21*8af74909SZhong Yang  * after each word.
22*8af74909SZhong Yang  */
ArrayFromWordList(char * wordlist,size_t slen,int * len,bool onlyLineEnds=false)23*8af74909SZhong Yang static char **ArrayFromWordList(char *wordlist, size_t slen, int *len, bool onlyLineEnds = false) {
24*8af74909SZhong Yang 	int prev = '\n';
25*8af74909SZhong Yang 	int words = 0;
26*8af74909SZhong Yang 	// For rapid determination of whether a character is a separator, build
27*8af74909SZhong Yang 	// a look up table.
28*8af74909SZhong Yang 	bool wordSeparator[256] = {};	// Initialise all to false.
29*8af74909SZhong Yang 	wordSeparator[static_cast<unsigned int>('\r')] = true;
30*8af74909SZhong Yang 	wordSeparator[static_cast<unsigned int>('\n')] = true;
31*8af74909SZhong Yang 	if (!onlyLineEnds) {
32*8af74909SZhong Yang 		wordSeparator[static_cast<unsigned int>(' ')] = true;
33*8af74909SZhong Yang 		wordSeparator[static_cast<unsigned int>('\t')] = true;
34*8af74909SZhong Yang 	}
35*8af74909SZhong Yang 	for (int j = 0; wordlist[j]; j++) {
36*8af74909SZhong Yang 		const int curr = static_cast<unsigned char>(wordlist[j]);
37*8af74909SZhong Yang 		if (!wordSeparator[curr] && wordSeparator[prev])
38*8af74909SZhong Yang 			words++;
39*8af74909SZhong Yang 		prev = curr;
40*8af74909SZhong Yang 	}
41*8af74909SZhong Yang 	char **keywords = new char *[words + 1];
42*8af74909SZhong Yang 	int wordsStore = 0;
43*8af74909SZhong Yang 	if (words) {
44*8af74909SZhong Yang 		prev = '\0';
45*8af74909SZhong Yang 		for (size_t k = 0; k < slen; k++) {
46*8af74909SZhong Yang 			if (!wordSeparator[static_cast<unsigned char>(wordlist[k])]) {
47*8af74909SZhong Yang 				if (!prev) {
48*8af74909SZhong Yang 					keywords[wordsStore] = &wordlist[k];
49*8af74909SZhong Yang 					wordsStore++;
50*8af74909SZhong Yang 				}
51*8af74909SZhong Yang 			} else {
52*8af74909SZhong Yang 				wordlist[k] = '\0';
53*8af74909SZhong Yang 			}
54*8af74909SZhong Yang 			prev = wordlist[k];
55*8af74909SZhong Yang 		}
56*8af74909SZhong Yang 	}
57*8af74909SZhong Yang 	assert(wordsStore < (words + 1));
58*8af74909SZhong Yang 	keywords[wordsStore] = &wordlist[slen];
59*8af74909SZhong Yang 	*len = wordsStore;
60*8af74909SZhong Yang 	return keywords;
61*8af74909SZhong Yang }
62*8af74909SZhong Yang 
WordList(bool onlyLineEnds_)63*8af74909SZhong Yang WordList::WordList(bool onlyLineEnds_) :
64*8af74909SZhong Yang 	words(0), list(0), len(0), onlyLineEnds(onlyLineEnds_) {
65*8af74909SZhong Yang 	// Prevent warnings by static analyzers about uninitialized starts.
66*8af74909SZhong Yang 	starts[0] = -1;
67*8af74909SZhong Yang }
68*8af74909SZhong Yang 
~WordList()69*8af74909SZhong Yang WordList::~WordList() {
70*8af74909SZhong Yang 	Clear();
71*8af74909SZhong Yang }
72*8af74909SZhong Yang 
operator bool() const73*8af74909SZhong Yang WordList::operator bool() const noexcept {
74*8af74909SZhong Yang 	return len ? true : false;
75*8af74909SZhong Yang }
76*8af74909SZhong Yang 
operator !=(const WordList & other) const77*8af74909SZhong Yang bool WordList::operator!=(const WordList &other) const noexcept {
78*8af74909SZhong Yang 	if (len != other.len)
79*8af74909SZhong Yang 		return true;
80*8af74909SZhong Yang 	for (int i=0; i<len; i++) {
81*8af74909SZhong Yang 		if (strcmp(words[i], other.words[i]) != 0)
82*8af74909SZhong Yang 			return true;
83*8af74909SZhong Yang 	}
84*8af74909SZhong Yang 	return false;
85*8af74909SZhong Yang }
86*8af74909SZhong Yang 
Length() const87*8af74909SZhong Yang int WordList::Length() const noexcept {
88*8af74909SZhong Yang 	return len;
89*8af74909SZhong Yang }
90*8af74909SZhong Yang 
Clear()91*8af74909SZhong Yang void WordList::Clear() noexcept {
92*8af74909SZhong Yang 	if (words) {
93*8af74909SZhong Yang 		delete []list;
94*8af74909SZhong Yang 		delete []words;
95*8af74909SZhong Yang 	}
96*8af74909SZhong Yang 	words = nullptr;
97*8af74909SZhong Yang 	list = nullptr;
98*8af74909SZhong Yang 	len = 0;
99*8af74909SZhong Yang }
100*8af74909SZhong Yang 
101*8af74909SZhong Yang #ifdef _MSC_VER
102*8af74909SZhong Yang 
cmpWords(const char * a,const char * b)103*8af74909SZhong Yang static bool cmpWords(const char *a, const char *b) {
104*8af74909SZhong Yang 	return strcmp(a, b) < 0;
105*8af74909SZhong Yang }
106*8af74909SZhong Yang 
107*8af74909SZhong Yang #else
108*8af74909SZhong Yang 
cmpWords(const void * a,const void * b)109*8af74909SZhong Yang static int cmpWords(const void *a, const void *b) {
110*8af74909SZhong Yang 	return strcmp(*static_cast<const char * const *>(a), *static_cast<const char * const *>(b));
111*8af74909SZhong Yang }
112*8af74909SZhong Yang 
SortWordList(char ** words,unsigned int len)113*8af74909SZhong Yang static void SortWordList(char **words, unsigned int len) {
114*8af74909SZhong Yang 	qsort(words, len, sizeof(*words), cmpWords);
115*8af74909SZhong Yang }
116*8af74909SZhong Yang 
117*8af74909SZhong Yang #endif
118*8af74909SZhong Yang 
Set(const char * s)119*8af74909SZhong Yang bool WordList::Set(const char *s) {
120*8af74909SZhong Yang 	const size_t lenS = strlen(s) + 1;
121*8af74909SZhong Yang 	char *listTemp = new char[lenS];
122*8af74909SZhong Yang 	memcpy(listTemp, s, lenS);
123*8af74909SZhong Yang 	int lenTemp = 0;
124*8af74909SZhong Yang 	char **wordsTemp = ArrayFromWordList(listTemp, lenS - 1, &lenTemp, onlyLineEnds);
125*8af74909SZhong Yang #ifdef _MSC_VER
126*8af74909SZhong Yang 	std::sort(wordsTemp, wordsTemp + lenTemp, cmpWords);
127*8af74909SZhong Yang #else
128*8af74909SZhong Yang 	SortWordList(wordsTemp, lenTemp);
129*8af74909SZhong Yang #endif
130*8af74909SZhong Yang 
131*8af74909SZhong Yang 	if (lenTemp == len) {
132*8af74909SZhong Yang 		bool changed = false;
133*8af74909SZhong Yang 		for (int i = 0; i < lenTemp; i++) {
134*8af74909SZhong Yang 			if (strcmp(words[i], wordsTemp[i]) != 0) {
135*8af74909SZhong Yang 				changed = true;
136*8af74909SZhong Yang 				break;
137*8af74909SZhong Yang 			}
138*8af74909SZhong Yang 		}
139*8af74909SZhong Yang 		if (!changed) {
140*8af74909SZhong Yang 			delete []listTemp;
141*8af74909SZhong Yang 			delete []wordsTemp;
142*8af74909SZhong Yang 			return false;
143*8af74909SZhong Yang 		}
144*8af74909SZhong Yang 	}
145*8af74909SZhong Yang 
146*8af74909SZhong Yang 	Clear();
147*8af74909SZhong Yang 	words = wordsTemp;
148*8af74909SZhong Yang 	list = listTemp;
149*8af74909SZhong Yang 	len = lenTemp;
150*8af74909SZhong Yang 	std::fill(starts, std::end(starts), -1);
151*8af74909SZhong Yang 	for (int l = len - 1; l >= 0; l--) {
152*8af74909SZhong Yang 		unsigned char indexChar = words[l][0];
153*8af74909SZhong Yang 		starts[indexChar] = l;
154*8af74909SZhong Yang 	}
155*8af74909SZhong Yang 	return true;
156*8af74909SZhong Yang }
157*8af74909SZhong Yang 
158*8af74909SZhong Yang /** Check whether a string is in the list.
159*8af74909SZhong Yang  * List elements are either exact matches or prefixes.
160*8af74909SZhong Yang  * Prefix elements start with '^' and match all strings that start with the rest of the element
161*8af74909SZhong Yang  * so '^GTK_' matches 'GTK_X', 'GTK_MAJOR_VERSION', and 'GTK_'.
162*8af74909SZhong Yang  */
InList(const char * s) const163*8af74909SZhong Yang bool WordList::InList(const char *s) const noexcept {
164*8af74909SZhong Yang 	if (0 == words)
165*8af74909SZhong Yang 		return false;
166*8af74909SZhong Yang 	const unsigned char firstChar = s[0];
167*8af74909SZhong Yang 	int j = starts[firstChar];
168*8af74909SZhong Yang 	if (j >= 0) {
169*8af74909SZhong Yang 		while (words[j][0] == firstChar) {
170*8af74909SZhong Yang 			if (s[1] == words[j][1]) {
171*8af74909SZhong Yang 				const char *a = words[j] + 1;
172*8af74909SZhong Yang 				const char *b = s + 1;
173*8af74909SZhong Yang 				while (*a && *a == *b) {
174*8af74909SZhong Yang 					a++;
175*8af74909SZhong Yang 					b++;
176*8af74909SZhong Yang 				}
177*8af74909SZhong Yang 				if (!*a && !*b)
178*8af74909SZhong Yang 					return true;
179*8af74909SZhong Yang 			}
180*8af74909SZhong Yang 			j++;
181*8af74909SZhong Yang 		}
182*8af74909SZhong Yang 	}
183*8af74909SZhong Yang 	j = starts[static_cast<unsigned int>('^')];
184*8af74909SZhong Yang 	if (j >= 0) {
185*8af74909SZhong Yang 		while (words[j][0] == '^') {
186*8af74909SZhong Yang 			const char *a = words[j] + 1;
187*8af74909SZhong Yang 			const char *b = s;
188*8af74909SZhong Yang 			while (*a && *a == *b) {
189*8af74909SZhong Yang 				a++;
190*8af74909SZhong Yang 				b++;
191*8af74909SZhong Yang 			}
192*8af74909SZhong Yang 			if (!*a)
193*8af74909SZhong Yang 				return true;
194*8af74909SZhong Yang 			j++;
195*8af74909SZhong Yang 		}
196*8af74909SZhong Yang 	}
197*8af74909SZhong Yang 	return false;
198*8af74909SZhong Yang }
199*8af74909SZhong Yang 
200*8af74909SZhong Yang /** similar to InList, but word s can be a substring of keyword.
201*8af74909SZhong Yang  * eg. the keyword define is defined as def~ine. This means the word must start
202*8af74909SZhong Yang  * with def to be a keyword, but also defi, defin and define are valid.
203*8af74909SZhong Yang  * The marker is ~ in this case.
204*8af74909SZhong Yang  */
InListAbbreviated(const char * s,const char marker) const205*8af74909SZhong Yang bool WordList::InListAbbreviated(const char *s, const char marker) const noexcept {
206*8af74909SZhong Yang 	if (0 == words)
207*8af74909SZhong Yang 		return false;
208*8af74909SZhong Yang 	const unsigned char firstChar = s[0];
209*8af74909SZhong Yang 	int j = starts[firstChar];
210*8af74909SZhong Yang 	if (j >= 0) {
211*8af74909SZhong Yang 		while (words[j][0] == firstChar) {
212*8af74909SZhong Yang 			bool isSubword = false;
213*8af74909SZhong Yang 			int start = 1;
214*8af74909SZhong Yang 			if (words[j][1] == marker) {
215*8af74909SZhong Yang 				isSubword = true;
216*8af74909SZhong Yang 				start++;
217*8af74909SZhong Yang 			}
218*8af74909SZhong Yang 			if (s[1] == words[j][start]) {
219*8af74909SZhong Yang 				const char *a = words[j] + start;
220*8af74909SZhong Yang 				const char *b = s + 1;
221*8af74909SZhong Yang 				while (*a && *a == *b) {
222*8af74909SZhong Yang 					a++;
223*8af74909SZhong Yang 					if (*a == marker) {
224*8af74909SZhong Yang 						isSubword = true;
225*8af74909SZhong Yang 						a++;
226*8af74909SZhong Yang 					}
227*8af74909SZhong Yang 					b++;
228*8af74909SZhong Yang 				}
229*8af74909SZhong Yang 				if ((!*a || isSubword) && !*b)
230*8af74909SZhong Yang 					return true;
231*8af74909SZhong Yang 			}
232*8af74909SZhong Yang 			j++;
233*8af74909SZhong Yang 		}
234*8af74909SZhong Yang 	}
235*8af74909SZhong Yang 	j = starts[static_cast<unsigned int>('^')];
236*8af74909SZhong Yang 	if (j >= 0) {
237*8af74909SZhong Yang 		while (words[j][0] == '^') {
238*8af74909SZhong Yang 			const char *a = words[j] + 1;
239*8af74909SZhong Yang 			const char *b = s;
240*8af74909SZhong Yang 			while (*a && *a == *b) {
241*8af74909SZhong Yang 				a++;
242*8af74909SZhong Yang 				b++;
243*8af74909SZhong Yang 			}
244*8af74909SZhong Yang 			if (!*a)
245*8af74909SZhong Yang 				return true;
246*8af74909SZhong Yang 			j++;
247*8af74909SZhong Yang 		}
248*8af74909SZhong Yang 	}
249*8af74909SZhong Yang 	return false;
250*8af74909SZhong Yang }
251*8af74909SZhong Yang 
252*8af74909SZhong Yang /** similar to InListAbbreviated, but word s can be a abridged version of a keyword.
253*8af74909SZhong Yang * eg. the keyword is defined as "after.~:". This means the word must have a prefix (begins with) of
254*8af74909SZhong Yang * "after." and suffix (ends with) of ":" to be a keyword, Hence "after.field:" , "after.form.item:" are valid.
255*8af74909SZhong Yang * Similarly "~.is.valid" keyword is suffix only... hence "field.is.valid" , "form.is.valid" are valid.
256*8af74909SZhong Yang * The marker is ~ in this case.
257*8af74909SZhong Yang * No multiple markers check is done and wont work.
258*8af74909SZhong Yang */
InListAbridged(const char * s,const char marker) const259*8af74909SZhong Yang bool WordList::InListAbridged(const char *s, const char marker) const noexcept {
260*8af74909SZhong Yang 	if (0 == words)
261*8af74909SZhong Yang 		return false;
262*8af74909SZhong Yang 	const unsigned char firstChar = s[0];
263*8af74909SZhong Yang 	int j = starts[firstChar];
264*8af74909SZhong Yang 	if (j >= 0) {
265*8af74909SZhong Yang 		while (words[j][0] == firstChar) {
266*8af74909SZhong Yang 			const char *a = words[j];
267*8af74909SZhong Yang 			const char *b = s;
268*8af74909SZhong Yang 			while (*a && *a == *b) {
269*8af74909SZhong Yang 				a++;
270*8af74909SZhong Yang 				if (*a == marker) {
271*8af74909SZhong Yang 					a++;
272*8af74909SZhong Yang 					const size_t suffixLengthA = strlen(a);
273*8af74909SZhong Yang 					const size_t suffixLengthB = strlen(b);
274*8af74909SZhong Yang 					if (suffixLengthA >= suffixLengthB)
275*8af74909SZhong Yang 						break;
276*8af74909SZhong Yang 					b = b + suffixLengthB - suffixLengthA - 1;
277*8af74909SZhong Yang 				}
278*8af74909SZhong Yang 				b++;
279*8af74909SZhong Yang 			}
280*8af74909SZhong Yang 			if (!*a  && !*b)
281*8af74909SZhong Yang 				return true;
282*8af74909SZhong Yang 			j++;
283*8af74909SZhong Yang 		}
284*8af74909SZhong Yang 	}
285*8af74909SZhong Yang 
286*8af74909SZhong Yang 	j = starts[static_cast<unsigned int>(marker)];
287*8af74909SZhong Yang 	if (j >= 0) {
288*8af74909SZhong Yang 		while (words[j][0] == marker) {
289*8af74909SZhong Yang 			const char *a = words[j] + 1;
290*8af74909SZhong Yang 			const char *b = s;
291*8af74909SZhong Yang 			const size_t suffixLengthA = strlen(a);
292*8af74909SZhong Yang 			const size_t suffixLengthB = strlen(b);
293*8af74909SZhong Yang 			if (suffixLengthA > suffixLengthB) {
294*8af74909SZhong Yang 				j++;
295*8af74909SZhong Yang 				continue;
296*8af74909SZhong Yang 			}
297*8af74909SZhong Yang 			b = b + suffixLengthB - suffixLengthA;
298*8af74909SZhong Yang 
299*8af74909SZhong Yang 			while (*a && *a == *b) {
300*8af74909SZhong Yang 				a++;
301*8af74909SZhong Yang 				b++;
302*8af74909SZhong Yang 			}
303*8af74909SZhong Yang 			if (!*a && !*b)
304*8af74909SZhong Yang 				return true;
305*8af74909SZhong Yang 			j++;
306*8af74909SZhong Yang 		}
307*8af74909SZhong Yang 	}
308*8af74909SZhong Yang 
309*8af74909SZhong Yang 	return false;
310*8af74909SZhong Yang }
311*8af74909SZhong Yang 
WordAt(int n) const312*8af74909SZhong Yang const char *WordList::WordAt(int n) const noexcept {
313*8af74909SZhong Yang 	return words[n];
314*8af74909SZhong Yang }
315*8af74909SZhong Yang 
316