1 // © 2020 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // uniquecharstr.h 5 // created: 2020sep01 Frank Yung-Fong Tang 6 7 #ifndef __UNIQUECHARSTR_H__ 8 #define __UNIQUECHARSTR_H__ 9 10 #include "charstr.h" 11 #include "uassert.h" 12 #include "uhash.h" 13 #include "cmemory.h" 14 15 U_NAMESPACE_BEGIN 16 17 /** 18 * Stores NUL-terminated strings with duplicate elimination. 19 * Checks for unique UTF-16 string pointers and converts to invariant characters. 20 * 21 * Intended to be stack-allocated. Add strings, get a unique number for each, 22 * freeze the object, get a char * pointer for each string, 23 * call orphanCharStrings() to capture the string storage, and let this object go out of scope. 24 */ 25 class UniqueCharStrings { 26 public: UniqueCharStrings(UErrorCode & errorCode)27 UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) { 28 // Note: We hash on string contents but store stable char16_t * pointers. 29 // If the strings are stored in resource bundles which should be built with 30 // duplicate elimination, then we should be able to hash on just the pointer values. 31 uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode); 32 if (U_FAILURE(errorCode)) { return; } 33 strings = new CharString(); 34 if (strings == nullptr) { 35 errorCode = U_MEMORY_ALLOCATION_ERROR; 36 } 37 } ~UniqueCharStrings()38 ~UniqueCharStrings() { 39 uhash_close(&map); 40 delete strings; 41 } 42 43 /** Returns/orphans the CharString that contains all strings. */ orphanCharStrings()44 CharString *orphanCharStrings() { 45 CharString *result = strings; 46 strings = nullptr; 47 return result; 48 } 49 50 /** 51 * Adds a NUL-terminated string and returns a unique number for it. 52 * The string must not change, nor move around in memory, 53 * while this UniqueCharStrings is in use. 54 * 55 * Best used with string data in a stable storage, such as strings returned 56 * by resource bundle functions. 57 */ add(const char16_t * p,UErrorCode & errorCode)58 int32_t add(const char16_t*p, UErrorCode &errorCode) { 59 if (U_FAILURE(errorCode)) { return -1; } 60 if (isFrozen) { 61 errorCode = U_NO_WRITE_PERMISSION; 62 return -1; 63 } 64 // The string points into the resource bundle. 65 int32_t oldIndex = uhash_geti(&map, p); 66 if (oldIndex != 0) { // found duplicate 67 return oldIndex; 68 } 69 // Explicit NUL terminator for the previous string. 70 // The strings object is also terminated with one implicit NUL. 71 strings->append(0, errorCode); 72 int32_t newIndex = strings->length(); 73 strings->appendInvariantChars(p, u_strlen(p), errorCode); 74 uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode); 75 return newIndex; 76 } 77 78 /** 79 * Adds a unicode string by value and returns a unique number for it. 80 */ addByValue(UnicodeString s,UErrorCode & errorCode)81 int32_t addByValue(UnicodeString s, UErrorCode &errorCode) { 82 if (U_FAILURE(errorCode)) { return -1; } 83 if (isFrozen) { 84 errorCode = U_NO_WRITE_PERMISSION; 85 return -1; 86 } 87 int32_t oldIndex = uhash_geti(&map, s.getTerminatedBuffer()); 88 if (oldIndex != 0) { // found duplicate 89 return oldIndex; 90 } 91 // We need to store the string content of the UnicodeString. 92 UnicodeString *key = keyStore.create(s); 93 if (key == nullptr) { 94 errorCode = U_MEMORY_ALLOCATION_ERROR; 95 return -1; 96 } 97 return add(key->getTerminatedBuffer(), errorCode); 98 } 99 freeze()100 void freeze() { isFrozen = true; } 101 102 /** 103 * Returns a string pointer for its unique number, if this object is frozen. 104 * Otherwise nullptr. 105 */ get(int32_t i)106 const char *get(int32_t i) const { 107 U_ASSERT(isFrozen); 108 return isFrozen && i > 0 ? strings->data() + i : nullptr; 109 } 110 111 private: 112 UHashtable map; 113 CharString *strings; 114 MemoryPool<UnicodeString> keyStore; 115 bool isFrozen = false; 116 }; 117 118 U_NAMESPACE_END 119 120 #endif // __UNIQUECHARSTR_H__ 121