1 /*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11 /* Implementation notes:
12 *
 * This is a very simple lorem ipsum generator
 * which features a static list of words
 * and prints them one after another randomly
 * with a fake sentence / paragraph structure.
17 *
18 * The goal is to generate a printable text
19 * that can be used to fake a text compression scenario.
20 * The resulting compression / ratio curve of the lorem ipsum generator
21 * is more satisfying than the previous statistical generator,
22 * which was initially designed for entropy compression,
23 * and lacks a regularity more representative of text.
24 *
25 * The compression ratio achievable on the generated lorem ipsum
26 * is still a bit too good, presumably because the dictionary is a bit too
27 * small. It would be possible to create some more complex scheme, notably by
28 * enlarging the dictionary with a word generator, and adding grammatical rules
29 * (composition) and syntax rules. But that's probably overkill for the intended
30 * goal.
31 */
32
33 #include "lorem.h"
34 #include <assert.h>
35 #include <limits.h> /* INT_MAX */
36 #include <string.h> /* memcpy */
37
/* Not referenced anywhere in the visible part of this file. */
#define WORD_MAX_SIZE 20

/* Word pool.
 * Order matters: generateFirstSentence() emits entries 0 to 18 verbatim,
 * and g_distrib stores indices into this table.
 * The leading comment of each row is the index of its first word. */
static const char* kWords[] = {
    /*   0 */ "lorem", "ipsum", "dolor", "sit", "amet",
    /*   5 */ "consectetur", "adipiscing", "elit", "sed", "do",
    /*  10 */ "eiusmod", "tempor", "incididunt", "ut", "labore",
    /*  15 */ "et", "dolore", "magna", "aliqua", "dis",
    /*  20 */ "lectus", "vestibulum", "mattis", "ullamcorper", "velit",
    /*  25 */ "commodo", "a", "lacus", "arcu", "magnis",
    /*  30 */ "parturient", "montes", "nascetur", "ridiculus", "mus",
    /*  35 */ "mauris", "nulla", "malesuada", "pellentesque", "eget",
    /*  40 */ "gravida", "in", "dictum", "non", "erat",
    /*  45 */ "nam", "voluptat", "maecenas", "blandit", "aliquam",
    /*  50 */ "etiam", "enim", "lobortis", "scelerisque", "fermentum",
    /*  55 */ "dui", "faucibus", "ornare", "at", "elementum",
    /*  60 */ "eu", "facilisis", "odio", "morbi", "quis",
    /*  65 */ "eros", "donec", "ac", "orci", "purus",
    /*  70 */ "turpis", "cursus", "leo", "vel", "porta",
    /*  75 */ "consequat", "interdum", "varius", "vulputate", "aliquet",
    /*  80 */ "pharetra", "nunc", "auctor", "urna", "id",
    /*  85 */ "metus", "viverra", "nibh", "cras", "mi",
    /*  90 */ "unde", "omnis", "iste", "natus", "error",
    /*  95 */ "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium",
    /* 100 */ "totam", "rem", "aperiam", "eaque", "ipsa",
    /* 105 */ "quae", "ab", "illo", "inventore", "veritatis",
    /* 110 */ "quasi", "architecto", "beatae", "vitae", "dicta",
    /* 115 */ "sunt", "explicabo", "nemo", "ipsam", "quia",
    /* 120 */ "voluptas", "aspernatur", "aut", "odit", "fugit",
    /* 125 */ "consequuntur", "magni", "dolores", "eos", "qui",
    /* 130 */ "ratione", "sequi", "nesciunt", "neque", "porro",
    /* 135 */ "quisquam", "est", "dolorem", "adipisci", "numquam",
    /* 140 */ "eius", "modi", "tempora", "incidunt", "magnam",
    /* 145 */ "quaerat", "ad", "minima", "veniam", "nostrum",
    /* 150 */ "ullam", "corporis", "suscipit", "laboriosam", "nisi",
    /* 155 */ "aliquid", "ex", "ea", "commodi", "consequatur",
    /* 160 */ "autem", "eum", "iure", "voluptate", "esse",
    /* 165 */ "quam", "nihil", "molestiae", "illum", "fugiat",
    /* 170 */ "quo", "pariatur", "vero", "accusamus", "iusto",
    /* 175 */ "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum",
    /* 180 */ "deleniti", "atque", "corrupti", "quos", "quas",
    /* 185 */ "molestias", "excepturi", "sint", "occaecati", "cupiditate",
    /* 190 */ "provident", "similique", "culpa", "officia", "deserunt",
    /* 195 */ "mollitia", "animi", "laborum", "dolorum", "fuga",
    /* 200 */ "harum", "quidem", "rerum", "facilis", "expedita",
    /* 205 */ "distinctio", "libero", "tempore", "cum", "soluta",
    /* 210 */ "nobis", "eligendi", "optio", "cumque", "impedit",
    /* 215 */ "minus", "quod", "maxime", "placeat", "facere",
    /* 220 */ "possimus", "assumenda", "repellendus", "temporibus", "quibusdam",
    /* 225 */ "officiis", "debitis", "saepe", "eveniet", "voluptates",
    /* 230 */ "repudiandae", "recusandae", "itaque", "earum", "hic",
    /* 235 */ "tenetur", "sapiente", "delectus", "reiciendis", "cillum",
    /* 240 */ "maiores", "alias", "perferendis", "doloribus", "asperiores",
    /* 245 */ "repellat", "minim", "nostrud", "exercitation", "ullamco",
    /* 250 */ "laboris", "aliquip", "duis", "aute", "irure",
};
/* Number of entries in kWords, derived from the table itself. */
static const unsigned kNbWords = (unsigned)(sizeof(kWords) / sizeof(*kWords));
95
/* Simple 1-dimension distribution: the selection weight of a word depends only
 * on its length (in characters), and favors small words.
 * Words longer than the last listed length reuse the last weight. */
static const int kWeights[] = {
    0, /* length 0 */
    8, /* length 1 */
    6, /* length 2 */
    4, /* length 3 */
    3, /* length 4 */
    2, /* length 5 and above */
};
static const size_t kNbWeights = sizeof(kWeights) / sizeof(*kWeights);
100
/* Weighted lookup table used to pick words:
 * init_word_distrib() stores each word index as many times as the word's
 * weight, so a uniform draw among the first g_distribCount slots selects words
 * in proportion to their weight.
 * g_distribCount == 0 doubles as the "table not built yet" marker tested by
 * LOREM_genBlock(). */
#define DISTRIB_SIZE_MAX 650
static int      g_distrib[DISTRIB_SIZE_MAX] = { 0 }; /* indices into kWords */
static unsigned g_distribCount              = 0;     /* nb of valid slots in g_distrib */
104
countFreqs(const char * words[],size_t nbWords,const int * weights,size_t nbWeights)105 static void countFreqs(
106 const char* words[],
107 size_t nbWords,
108 const int* weights,
109 size_t nbWeights)
110 {
111 unsigned total = 0;
112 size_t w;
113 for (w = 0; w < nbWords; w++) {
114 size_t len = strlen(words[w]);
115 int lmax;
116 if (len >= nbWeights)
117 len = nbWeights - 1;
118 lmax = weights[len];
119 total += (unsigned)lmax;
120 }
121 g_distribCount = total;
122 assert(g_distribCount <= DISTRIB_SIZE_MAX);
123 }
124
init_word_distrib(const char * words[],size_t nbWords,const int * weights,size_t nbWeights)125 static void init_word_distrib(
126 const char* words[],
127 size_t nbWords,
128 const int* weights,
129 size_t nbWeights)
130 {
131 size_t w, d = 0;
132 countFreqs(words, nbWords, weights, nbWeights);
133 for (w = 0; w < nbWords; w++) {
134 size_t len = strlen(words[w]);
135 int l, lmax;
136 if (len >= nbWeights)
137 len = nbWeights - 1;
138 lmax = weights[len];
139 for (l = 0; l < lmax; l++) {
140 g_distrib[d++] = (int)w;
141 }
142 }
143 }
144
/* Generation state, shared by the functions below.
 * Note: this unit only works when invoked sequentially.
 * No concurrent access is allowed */
static char*    g_ptr      = NULL;     /* destination buffer; set by LOREM_genBlock() */
static size_t   g_nbChars  = 0;        /* nb of characters already written into g_ptr */
static size_t   g_maxChars = 10000000; /* capacity of g_ptr; overwritten by LOREM_genBlock() */
static unsigned g_randRoot = 0;        /* state of the pseudo-random generator */
151
152 #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r)))
LOREM_rand(unsigned range)153 static unsigned LOREM_rand(unsigned range)
154 {
155 static const unsigned prime1 = 2654435761U;
156 static const unsigned prime2 = 2246822519U;
157 unsigned rand32 = g_randRoot;
158 rand32 *= prime1;
159 rand32 ^= prime2;
160 rand32 = RDG_rotl32(rand32, 13);
161 g_randRoot = rand32;
162 return (unsigned)(((unsigned long long)rand32 * range) >> 32);
163 }
164
writeLastCharacters(void)165 static void writeLastCharacters(void)
166 {
167 size_t lastChars = g_maxChars - g_nbChars;
168 assert(g_maxChars >= g_nbChars);
169 if (lastChars == 0)
170 return;
171 g_ptr[g_nbChars++] = '.';
172 if (lastChars > 2) {
173 memset(g_ptr + g_nbChars, ' ', lastChars - 2);
174 }
175 if (lastChars > 1) {
176 g_ptr[g_maxChars - 1] = '\n';
177 }
178 g_nbChars = g_maxChars;
179 }
180
generateWord(const char * word,const char * separator,int upCase)181 static void generateWord(const char* word, const char* separator, int upCase)
182 {
183 size_t const len = strlen(word) + strlen(separator);
184 if (g_nbChars + len > g_maxChars) {
185 writeLastCharacters();
186 return;
187 }
188 memcpy(g_ptr + g_nbChars, word, strlen(word));
189 if (upCase) {
190 static const char toUp = 'A' - 'a';
191 g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp);
192 }
193 g_nbChars += strlen(word);
194 memcpy(g_ptr + g_nbChars, separator, strlen(separator));
195 g_nbChars += strlen(separator);
196 }
197
/* about() :
 * Returns 1 + the sum of two successive LOREM_rand(target) draws,
 * hence a value between 1 and 2*target - 1 (for target >= 1).
 * Consumes two steps of the random generator.
 */
static int about(unsigned target)
{
    unsigned const draw1 = LOREM_rand(target);
    unsigned const draw2 = LOREM_rand(target);
    return (int)(draw1 + draw2 + 1);
}
202
203 /* Function to generate a random sentence */
generateSentence(int nbWords)204 static void generateSentence(int nbWords)
205 {
206 int commaPos = about(9);
207 int comma2 = commaPos + about(7);
208 int qmark = (LOREM_rand(11) == 7);
209 const char* endSep = qmark ? "? " : ". ";
210 int i;
211 for (i = 0; i < nbWords; i++) {
212 int const wordID = g_distrib[LOREM_rand(g_distribCount)];
213 const char* const word = kWords[wordID];
214 const char* sep = " ";
215 if (i == commaPos)
216 sep = ", ";
217 if (i == comma2)
218 sep = ", ";
219 if (i == nbWords - 1)
220 sep = endSep;
221 generateWord(word, sep, i == 0);
222 }
223 }
224
generateParagraph(int nbSentences)225 static void generateParagraph(int nbSentences)
226 {
227 int i;
228 for (i = 0; i < nbSentences; i++) {
229 int wordsPerSentence = about(11);
230 generateSentence(wordsPerSentence);
231 }
232 if (g_nbChars < g_maxChars) {
233 g_ptr[g_nbChars++] = '\n';
234 }
235 if (g_nbChars < g_maxChars) {
236 g_ptr[g_nbChars++] = '\n';
237 }
238 }
239
240 /* It's "common" for lorem ipsum generators to start with the same first
241 * pre-defined sentence */
generateFirstSentence(void)242 static void generateFirstSentence(void)
243 {
244 int i;
245 for (i = 0; i < 18; i++) {
246 const char* word = kWords[i];
247 const char* separator = " ";
248 if (i == 4)
249 separator = ", ";
250 if (i == 7)
251 separator = ", ";
252 generateWord(word, separator, i == 0);
253 }
254 generateWord(kWords[18], ". ", 0);
255 }
256
257 size_t
LOREM_genBlock(void * buffer,size_t size,unsigned seed,int first,int fill)258 LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
259 {
260 g_ptr = (char*)buffer;
261 assert(size < INT_MAX);
262 g_maxChars = size;
263 g_nbChars = 0;
264 g_randRoot = seed;
265 if (g_distribCount == 0) {
266 init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
267 }
268
269 if (first) {
270 generateFirstSentence();
271 }
272 while (g_nbChars < g_maxChars) {
273 int sentencePerParagraph = about(7);
274 generateParagraph(sentencePerParagraph);
275 if (!fill)
276 break; /* only generate one paragraph in not-fill mode */
277 }
278 g_ptr = NULL;
279 return g_nbChars;
280 }
281
/* LOREM_genBuffer() :
 * Fills @buffer with lorem ipsum text derived from @seed :
 * invokes LOREM_genBlock() with both `first` and `fill` enabled,
 * and discards the returned size.
 */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const first = 1;
    int const fill  = 1;
    (void)LOREM_genBlock(buffer, size, seed, first, fill);
}
286