1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) International Business Machines Corp., 2000-2004
4 * Portions Copyright (C) Tino Reichardt, 2012
5 */
6
7 #include <linux/fs.h>
8 #include <linux/slab.h>
9 #include "jfs_incore.h"
10 #include "jfs_superblock.h"
11 #include "jfs_dmap.h"
12 #include "jfs_imap.h"
13 #include "jfs_lock.h"
14 #include "jfs_metapage.h"
15 #include "jfs_debug.h"
16 #include "jfs_discard.h"
17
18 /*
19 * SERIALIZATION of the Block Allocation Map.
20 *
21 * the working state of the block allocation map is accessed in
22 * two directions:
23 *
24 * 1) allocation and free requests that start at the dmap
25 * level and move up through the dmap control pages (i.e.
26 * the vast majority of requests).
27 *
28 * 2) allocation requests that start at dmap control page
29 * level and work down towards the dmaps.
30 *
31 * the serialization scheme used here is as follows.
32 *
33 * requests which start at the bottom are serialized against each
34 * other through buffers and each requests holds onto its buffers
35 * as it works it way up from a single dmap to the required level
36 * of dmap control page.
37 * requests that start at the top are serialized against each other
38 * and request that start from the bottom by the multiple read/single
39 * write inode lock of the bmap inode. requests starting at the top
40 * take this lock in write mode while request starting at the bottom
41 * take the lock in read mode. a single top-down request may proceed
42 * exclusively while multiple bottoms-up requests may proceed
43 * simultaneously (under the protection of busy buffers).
44 *
45 * in addition to information found in dmaps and dmap control pages,
46 * the working state of the block allocation map also includes read/
47 * write information maintained in the bmap descriptor (i.e. total
48 * free block count, allocation group level free block counts).
49 * a single exclusive lock (BMAP_LOCK) is used to guard this information
50 * in the face of multiple-bottoms up requests.
51 * (lock ordering: IREAD_LOCK, BMAP_LOCK);
52 *
53 * accesses to the persistent state of the block allocation map (limited
54 * to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
55 */
56
57 #define BMAP_LOCK_INIT(bmp) mutex_init(&bmp->db_bmaplock)
58 #define BMAP_LOCK(bmp) mutex_lock(&bmp->db_bmaplock)
59 #define BMAP_UNLOCK(bmp) mutex_unlock(&bmp->db_bmaplock)
60
61 /*
62 * forward references
63 */
64 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
65 int nblocks);
66 static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl);
67 static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl);
68 static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl);
69 static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl);
70 static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
71 int level);
72 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
73 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
74 int nblocks);
75 static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
76 int nblocks,
77 int l2nb, s64 * results);
78 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
79 int nblocks);
80 static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
81 int l2nb,
82 s64 * results);
83 static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
84 s64 * results);
85 static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
86 s64 * results);
87 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
88 static int dbFindBits(u32 word, int l2nb);
89 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
90 static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl);
91 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
92 int nblocks);
93 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
94 int nblocks);
95 static int dbMaxBud(u8 * cp);
96 static int blkstol2(s64 nb);
97
98 static int cntlz(u32 value);
99 static int cnttz(u32 word);
100
101 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
102 int nblocks);
103 static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
104 static int dbInitDmapTree(struct dmap * dp);
105 static int dbInitTree(struct dmaptree * dtp);
106 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
107 static int dbGetL2AGSize(s64 nblocks);
108
109 /*
110 * buddy table
111 *
112 * table used for determining buddy sizes within characters of
113 * dmap bitmap words. the characters themselves serve as indexes
114 * into the table, with the table elements yielding the maximum
115 * binary buddy of free bits within the character.
116 */
117 static const s8 budtab[256] = {
118 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
119 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
120 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
121 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
123 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
124 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
125 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
126 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
128 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
129 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
130 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
131 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
132 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
133 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
134 };
135
136 /*
137 * NAME: dbMount()
138 *
139 * FUNCTION: initializate the block allocation map.
140 *
141 * memory is allocated for the in-core bmap descriptor and
142 * the in-core descriptor is initialized from disk.
143 *
144 * PARAMETERS:
145 * ipbmap - pointer to in-core inode for the block map.
146 *
147 * RETURN VALUES:
148 * 0 - success
149 * -ENOMEM - insufficient memory
150 * -EIO - i/o error
151 * -EINVAL - wrong bmap data
152 */
dbMount(struct inode * ipbmap)153 int dbMount(struct inode *ipbmap)
154 {
155 struct bmap *bmp;
156 struct dbmap_disk *dbmp_le;
157 struct metapage *mp;
158 int i, err;
159
160 /*
161 * allocate/initialize the in-memory bmap descriptor
162 */
163 /* allocate memory for the in-memory bmap descriptor */
164 bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
165 if (bmp == NULL)
166 return -ENOMEM;
167
168 /* read the on-disk bmap descriptor. */
169 mp = read_metapage(ipbmap,
170 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
171 PSIZE, 0);
172 if (mp == NULL) {
173 err = -EIO;
174 goto err_kfree_bmp;
175 }
176
177 /* copy the on-disk bmap descriptor to its in-memory version. */
178 dbmp_le = (struct dbmap_disk *) mp->data;
179 bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
180 bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
181
182 bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
183 if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE ||
184 bmp->db_l2nbperpage < 0) {
185 err = -EINVAL;
186 goto err_release_metapage;
187 }
188
189 bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
190 if (!bmp->db_numag || bmp->db_numag > MAXAG) {
191 err = -EINVAL;
192 goto err_release_metapage;
193 }
194
195 bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
196 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
197 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
198 if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 ||
199 bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) {
200 err = -EINVAL;
201 goto err_release_metapage;
202 }
203
204 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
205 bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
206 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
207 if (!bmp->db_agwidth) {
208 err = -EINVAL;
209 goto err_release_metapage;
210 }
211 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
212 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
213 if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG ||
214 bmp->db_agl2size < 0) {
215 err = -EINVAL;
216 goto err_release_metapage;
217 }
218
219 if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
220 err = -EINVAL;
221 goto err_release_metapage;
222 }
223
224 for (i = 0; i < MAXAG; i++)
225 bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
226 bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
227 bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
228
229 /* release the buffer. */
230 release_metapage(mp);
231
232 /* bind the bmap inode and the bmap descriptor to each other. */
233 bmp->db_ipbmap = ipbmap;
234 JFS_SBI(ipbmap->i_sb)->bmap = bmp;
235
236 memset(bmp->db_active, 0, sizeof(bmp->db_active));
237
238 /*
239 * allocate/initialize the bmap lock
240 */
241 BMAP_LOCK_INIT(bmp);
242
243 return (0);
244
245 err_release_metapage:
246 release_metapage(mp);
247 err_kfree_bmp:
248 kfree(bmp);
249 return err;
250 }
251
252
253 /*
254 * NAME: dbUnmount()
255 *
256 * FUNCTION: terminate the block allocation map in preparation for
257 * file system unmount.
258 *
259 * the in-core bmap descriptor is written to disk and
260 * the memory for this descriptor is freed.
261 *
262 * PARAMETERS:
263 * ipbmap - pointer to in-core inode for the block map.
264 *
265 * RETURN VALUES:
266 * 0 - success
267 * -EIO - i/o error
268 */
dbUnmount(struct inode * ipbmap,int mounterror)269 int dbUnmount(struct inode *ipbmap, int mounterror)
270 {
271 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
272
273 if (!(mounterror || isReadOnly(ipbmap)))
274 dbSync(ipbmap);
275
276 /*
277 * Invalidate the page cache buffers
278 */
279 truncate_inode_pages(ipbmap->i_mapping, 0);
280
281 /* free the memory for the in-memory bmap. */
282 kfree(bmp);
283 JFS_SBI(ipbmap->i_sb)->bmap = NULL;
284
285 return (0);
286 }
287
288 /*
289 * dbSync()
290 */
dbSync(struct inode * ipbmap)291 int dbSync(struct inode *ipbmap)
292 {
293 struct dbmap_disk *dbmp_le;
294 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
295 struct metapage *mp;
296 int i;
297
298 /*
299 * write bmap global control page
300 */
301 /* get the buffer for the on-disk bmap descriptor. */
302 mp = read_metapage(ipbmap,
303 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
304 PSIZE, 0);
305 if (mp == NULL) {
306 jfs_err("dbSync: read_metapage failed!");
307 return -EIO;
308 }
309 /* copy the in-memory version of the bmap to the on-disk version */
310 dbmp_le = (struct dbmap_disk *) mp->data;
311 dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
312 dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree);
313 dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
314 dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag);
315 dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
316 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
317 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
318 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
319 dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
320 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
321 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
322 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
323 for (i = 0; i < MAXAG; i++)
324 dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]);
325 dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize);
326 dbmp_le->dn_maxfreebud = bmp->db_maxfreebud;
327
328 /* write the buffer */
329 write_metapage(mp);
330
331 /*
332 * write out dirty pages of bmap
333 */
334 filemap_write_and_wait(ipbmap->i_mapping);
335
336 diWriteSpecial(ipbmap, 0);
337
338 return (0);
339 }
340
341 /*
342 * NAME: dbFree()
343 *
344 * FUNCTION: free the specified block range from the working block
345 * allocation map.
346 *
347 * the blocks will be free from the working map one dmap
348 * at a time.
349 *
350 * PARAMETERS:
351 * ip - pointer to in-core inode;
352 * blkno - starting block number to be freed.
353 * nblocks - number of blocks to be freed.
354 *
355 * RETURN VALUES:
356 * 0 - success
357 * -EIO - i/o error
358 */
dbFree(struct inode * ip,s64 blkno,s64 nblocks)359 int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
360 {
361 struct metapage *mp;
362 struct dmap *dp;
363 int nb, rc;
364 s64 lblkno, rem;
365 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
366 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
367 struct super_block *sb = ipbmap->i_sb;
368
369 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
370
371 /* block to be freed better be within the mapsize. */
372 if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
373 IREAD_UNLOCK(ipbmap);
374 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
375 (unsigned long long) blkno,
376 (unsigned long long) nblocks);
377 jfs_error(ip->i_sb, "block to be freed is outside the map\n");
378 return -EIO;
379 }
380
381 /**
382 * TRIM the blocks, when mounted with discard option
383 */
384 if (JFS_SBI(sb)->flag & JFS_DISCARD)
385 if (JFS_SBI(sb)->minblks_trim <= nblocks)
386 jfs_issue_discard(ipbmap, blkno, nblocks);
387
388 /*
389 * free the blocks a dmap at a time.
390 */
391 mp = NULL;
392 for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
393 /* release previous dmap if any */
394 if (mp) {
395 write_metapage(mp);
396 }
397
398 /* get the buffer for the current dmap. */
399 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
400 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
401 if (mp == NULL) {
402 IREAD_UNLOCK(ipbmap);
403 return -EIO;
404 }
405 dp = (struct dmap *) mp->data;
406
407 /* determine the number of blocks to be freed from
408 * this dmap.
409 */
410 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
411
412 /* free the blocks. */
413 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
414 jfs_error(ip->i_sb, "error in block map\n");
415 release_metapage(mp);
416 IREAD_UNLOCK(ipbmap);
417 return (rc);
418 }
419 }
420
421 /* write the last buffer. */
422 if (mp)
423 write_metapage(mp);
424
425 IREAD_UNLOCK(ipbmap);
426
427 return (0);
428 }
429
430
431 /*
432 * NAME: dbUpdatePMap()
433 *
434 * FUNCTION: update the allocation state (free or allocate) of the
435 * specified block range in the persistent block allocation map.
436 *
437 * the blocks will be updated in the persistent map one
438 * dmap at a time.
439 *
440 * PARAMETERS:
441 * ipbmap - pointer to in-core inode for the block map.
442 * free - 'true' if block range is to be freed from the persistent
443 * map; 'false' if it is to be allocated.
444 * blkno - starting block number of the range.
445 * nblocks - number of contiguous blocks in the range.
446 * tblk - transaction block;
447 *
448 * RETURN VALUES:
449 * 0 - success
450 * -EIO - i/o error
451 */
452 int
dbUpdatePMap(struct inode * ipbmap,int free,s64 blkno,s64 nblocks,struct tblock * tblk)453 dbUpdatePMap(struct inode *ipbmap,
454 int free, s64 blkno, s64 nblocks, struct tblock * tblk)
455 {
456 int nblks, dbitno, wbitno, rbits;
457 int word, nbits, nwords;
458 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
459 s64 lblkno, rem, lastlblkno;
460 u32 mask;
461 struct dmap *dp;
462 struct metapage *mp;
463 struct jfs_log *log;
464 int lsn, difft, diffp;
465 unsigned long flags;
466
467 /* the blocks better be within the mapsize. */
468 if (blkno + nblocks > bmp->db_mapsize) {
469 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
470 (unsigned long long) blkno,
471 (unsigned long long) nblocks);
472 jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
473 return -EIO;
474 }
475
476 /* compute delta of transaction lsn from log syncpt */
477 lsn = tblk->lsn;
478 log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
479 logdiff(difft, lsn, log);
480
481 /*
482 * update the block state a dmap at a time.
483 */
484 mp = NULL;
485 lastlblkno = 0;
486 for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
487 /* get the buffer for the current dmap. */
488 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
489 if (lblkno != lastlblkno) {
490 if (mp) {
491 write_metapage(mp);
492 }
493
494 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
495 0);
496 if (mp == NULL)
497 return -EIO;
498 metapage_wait_for_io(mp);
499 }
500 dp = (struct dmap *) mp->data;
501
502 /* determine the bit number and word within the dmap of
503 * the starting block. also determine how many blocks
504 * are to be updated within this dmap.
505 */
506 dbitno = blkno & (BPERDMAP - 1);
507 word = dbitno >> L2DBWORD;
508 nblks = min(rem, (s64)BPERDMAP - dbitno);
509
510 /* update the bits of the dmap words. the first and last
511 * words may only have a subset of their bits updated. if
512 * this is the case, we'll work against that word (i.e.
513 * partial first and/or last) only in a single pass. a
514 * single pass will also be used to update all words that
515 * are to have all their bits updated.
516 */
517 for (rbits = nblks; rbits > 0;
518 rbits -= nbits, dbitno += nbits) {
519 /* determine the bit number within the word and
520 * the number of bits within the word.
521 */
522 wbitno = dbitno & (DBWORD - 1);
523 nbits = min(rbits, DBWORD - wbitno);
524
525 /* check if only part of the word is to be updated. */
526 if (nbits < DBWORD) {
527 /* update (free or allocate) the bits
528 * in this word.
529 */
530 mask =
531 (ONES << (DBWORD - nbits) >> wbitno);
532 if (free)
533 dp->pmap[word] &=
534 cpu_to_le32(~mask);
535 else
536 dp->pmap[word] |=
537 cpu_to_le32(mask);
538
539 word += 1;
540 } else {
541 /* one or more words are to have all
542 * their bits updated. determine how
543 * many words and how many bits.
544 */
545 nwords = rbits >> L2DBWORD;
546 nbits = nwords << L2DBWORD;
547
548 /* update (free or allocate) the bits
549 * in these words.
550 */
551 if (free)
552 memset(&dp->pmap[word], 0,
553 nwords * 4);
554 else
555 memset(&dp->pmap[word], (int) ONES,
556 nwords * 4);
557
558 word += nwords;
559 }
560 }
561
562 /*
563 * update dmap lsn
564 */
565 if (lblkno == lastlblkno)
566 continue;
567
568 lastlblkno = lblkno;
569
570 LOGSYNC_LOCK(log, flags);
571 if (mp->lsn != 0) {
572 /* inherit older/smaller lsn */
573 logdiff(diffp, mp->lsn, log);
574 if (difft < diffp) {
575 mp->lsn = lsn;
576
577 /* move bp after tblock in logsync list */
578 list_move(&mp->synclist, &tblk->synclist);
579 }
580
581 /* inherit younger/larger clsn */
582 logdiff(difft, tblk->clsn, log);
583 logdiff(diffp, mp->clsn, log);
584 if (difft > diffp)
585 mp->clsn = tblk->clsn;
586 } else {
587 mp->log = log;
588 mp->lsn = lsn;
589
590 /* insert bp after tblock in logsync list */
591 log->count++;
592 list_add(&mp->synclist, &tblk->synclist);
593
594 mp->clsn = tblk->clsn;
595 }
596 LOGSYNC_UNLOCK(log, flags);
597 }
598
599 /* write the last buffer. */
600 if (mp) {
601 write_metapage(mp);
602 }
603
604 return (0);
605 }
606
607
608 /*
609 * NAME: dbNextAG()
610 *
611 * FUNCTION: find the preferred allocation group for new allocations.
612 *
613 * Within the allocation groups, we maintain a preferred
614 * allocation group which consists of a group with at least
615 * average free space. It is the preferred group that we target
616 * new inode allocation towards. The tie-in between inode
617 * allocation and block allocation occurs as we allocate the
618 * first (data) block of an inode and specify the inode (block)
619 * as the allocation hint for this block.
620 *
621 * We try to avoid having more than one open file growing in
622 * an allocation group, as this will lead to fragmentation.
623 * This differs from the old OS/2 method of trying to keep
624 * empty ags around for large allocations.
625 *
626 * PARAMETERS:
627 * ipbmap - pointer to in-core inode for the block map.
628 *
629 * RETURN VALUES:
630 * the preferred allocation group number.
631 */
dbNextAG(struct inode * ipbmap)632 int dbNextAG(struct inode *ipbmap)
633 {
634 s64 avgfree;
635 int agpref;
636 s64 hwm = 0;
637 int i;
638 int next_best = -1;
639 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
640
641 BMAP_LOCK(bmp);
642
643 /* determine the average number of free blocks within the ags. */
644 avgfree = (u32)bmp->db_nfree / bmp->db_numag;
645
646 /*
647 * if the current preferred ag does not have an active allocator
648 * and has at least average freespace, return it
649 */
650 agpref = bmp->db_agpref;
651 if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
652 (bmp->db_agfree[agpref] >= avgfree))
653 goto unlock;
654
655 /* From the last preferred ag, find the next one with at least
656 * average free space.
657 */
658 for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
659 if (agpref >= bmp->db_numag)
660 agpref = 0;
661
662 if (atomic_read(&bmp->db_active[agpref]))
663 /* open file is currently growing in this ag */
664 continue;
665 if (bmp->db_agfree[agpref] >= avgfree) {
666 /* Return this one */
667 bmp->db_agpref = agpref;
668 goto unlock;
669 } else if (bmp->db_agfree[agpref] > hwm) {
670 /* Less than avg. freespace, but best so far */
671 hwm = bmp->db_agfree[agpref];
672 next_best = agpref;
673 }
674 }
675
676 /*
677 * If no inactive ag was found with average freespace, use the
678 * next best
679 */
680 if (next_best != -1)
681 bmp->db_agpref = next_best;
682 /* else leave db_agpref unchanged */
683 unlock:
684 BMAP_UNLOCK(bmp);
685
686 /* return the preferred group.
687 */
688 return (bmp->db_agpref);
689 }
690
691 /*
692 * NAME: dbAlloc()
693 *
694 * FUNCTION: attempt to allocate a specified number of contiguous free
695 * blocks from the working allocation block map.
696 *
697 * the block allocation policy uses hints and a multi-step
698 * approach.
699 *
700 * for allocation requests smaller than the number of blocks
701 * per dmap, we first try to allocate the new blocks
702 * immediately following the hint. if these blocks are not
703 * available, we try to allocate blocks near the hint. if
704 * no blocks near the hint are available, we next try to
705 * allocate within the same dmap as contains the hint.
706 *
707 * if no blocks are available in the dmap or the allocation
708 * request is larger than the dmap size, we try to allocate
709 * within the same allocation group as contains the hint. if
710 * this does not succeed, we finally try to allocate anywhere
711 * within the aggregate.
712 *
713 * we also try to allocate anywhere within the aggregate
714 * for allocation requests larger than the allocation group
715 * size or requests that specify no hint value.
716 *
717 * PARAMETERS:
718 * ip - pointer to in-core inode;
719 * hint - allocation hint.
720 * nblocks - number of contiguous blocks in the range.
721 * results - on successful return, set to the starting block number
722 * of the newly allocated contiguous range.
723 *
724 * RETURN VALUES:
725 * 0 - success
726 * -ENOSPC - insufficient disk resources
727 * -EIO - i/o error
728 */
dbAlloc(struct inode * ip,s64 hint,s64 nblocks,s64 * results)729 int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
730 {
731 int rc, agno;
732 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
733 struct bmap *bmp;
734 struct metapage *mp;
735 s64 lblkno, blkno;
736 struct dmap *dp;
737 int l2nb;
738 s64 mapSize;
739 int writers;
740
741 /* assert that nblocks is valid */
742 assert(nblocks > 0);
743
744 /* get the log2 number of blocks to be allocated.
745 * if the number of blocks is not a log2 multiple,
746 * it will be rounded up to the next log2 multiple.
747 */
748 l2nb = BLKSTOL2(nblocks);
749
750 bmp = JFS_SBI(ip->i_sb)->bmap;
751
752 mapSize = bmp->db_mapsize;
753
754 /* the hint should be within the map */
755 if (hint >= mapSize) {
756 jfs_error(ip->i_sb, "the hint is outside the map\n");
757 return -EIO;
758 }
759
760 /* if the number of blocks to be allocated is greater than the
761 * allocation group size, try to allocate anywhere.
762 */
763 if (l2nb > bmp->db_agl2size) {
764 IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
765
766 rc = dbAllocAny(bmp, nblocks, l2nb, results);
767
768 goto write_unlock;
769 }
770
771 /*
772 * If no hint, let dbNextAG recommend an allocation group
773 */
774 if (hint == 0)
775 goto pref_ag;
776
777 /* we would like to allocate close to the hint. adjust the
778 * hint to the block following the hint since the allocators
779 * will start looking for free space starting at this point.
780 */
781 blkno = hint + 1;
782
783 if (blkno >= bmp->db_mapsize)
784 goto pref_ag;
785
786 agno = blkno >> bmp->db_agl2size;
787
788 /* check if blkno crosses over into a new allocation group.
789 * if so, check if we should allow allocations within this
790 * allocation group.
791 */
792 if ((blkno & (bmp->db_agsize - 1)) == 0)
793 /* check if the AG is currently being written to.
794 * if so, call dbNextAG() to find a non-busy
795 * AG with sufficient free space.
796 */
797 if (atomic_read(&bmp->db_active[agno]))
798 goto pref_ag;
799
800 /* check if the allocation request size can be satisfied from a
801 * single dmap. if so, try to allocate from the dmap containing
802 * the hint using a tiered strategy.
803 */
804 if (nblocks <= BPERDMAP) {
805 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
806
807 /* get the buffer for the dmap containing the hint.
808 */
809 rc = -EIO;
810 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
811 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
812 if (mp == NULL)
813 goto read_unlock;
814
815 dp = (struct dmap *) mp->data;
816
817 /* first, try to satisfy the allocation request with the
818 * blocks beginning at the hint.
819 */
820 if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
821 != -ENOSPC) {
822 if (rc == 0) {
823 *results = blkno;
824 mark_metapage_dirty(mp);
825 }
826
827 release_metapage(mp);
828 goto read_unlock;
829 }
830
831 writers = atomic_read(&bmp->db_active[agno]);
832 if ((writers > 1) ||
833 ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
834 /*
835 * Someone else is writing in this allocation
836 * group. To avoid fragmenting, try another ag
837 */
838 release_metapage(mp);
839 IREAD_UNLOCK(ipbmap);
840 goto pref_ag;
841 }
842
843 /* next, try to satisfy the allocation request with blocks
844 * near the hint.
845 */
846 if ((rc =
847 dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
848 != -ENOSPC) {
849 if (rc == 0)
850 mark_metapage_dirty(mp);
851
852 release_metapage(mp);
853 goto read_unlock;
854 }
855
856 /* try to satisfy the allocation request with blocks within
857 * the same dmap as the hint.
858 */
859 if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
860 != -ENOSPC) {
861 if (rc == 0)
862 mark_metapage_dirty(mp);
863
864 release_metapage(mp);
865 goto read_unlock;
866 }
867
868 release_metapage(mp);
869 IREAD_UNLOCK(ipbmap);
870 }
871
872 /* try to satisfy the allocation request with blocks within
873 * the same allocation group as the hint.
874 */
875 IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
876 if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
877 goto write_unlock;
878
879 IWRITE_UNLOCK(ipbmap);
880
881
882 pref_ag:
883 /*
884 * Let dbNextAG recommend a preferred allocation group
885 */
886 agno = dbNextAG(ipbmap);
887 IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
888
889 /* Try to allocate within this allocation group. if that fails, try to
890 * allocate anywhere in the map.
891 */
892 if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
893 rc = dbAllocAny(bmp, nblocks, l2nb, results);
894
895 write_unlock:
896 IWRITE_UNLOCK(ipbmap);
897
898 return (rc);
899
900 read_unlock:
901 IREAD_UNLOCK(ipbmap);
902
903 return (rc);
904 }
905
906 /*
907 * NAME: dbReAlloc()
908 *
909 * FUNCTION: attempt to extend a current allocation by a specified
910 * number of blocks.
911 *
912 * this routine attempts to satisfy the allocation request
913 * by first trying to extend the existing allocation in
914 * place by allocating the additional blocks as the blocks
915 * immediately following the current allocation. if these
916 * blocks are not available, this routine will attempt to
917 * allocate a new set of contiguous blocks large enough
918 * to cover the existing allocation plus the additional
919 * number of blocks required.
920 *
921 * PARAMETERS:
922 * ip - pointer to in-core inode requiring allocation.
923 * blkno - starting block of the current allocation.
924 * nblocks - number of contiguous blocks within the current
925 * allocation.
926 * addnblocks - number of blocks to add to the allocation.
927 * results - on successful return, set to the starting block number
928 * of the existing allocation if the existing allocation
929 * was extended in place or to a newly allocated contiguous
930 * range if the existing allocation could not be extended
931 * in place.
932 *
933 * RETURN VALUES:
934 * 0 - success
935 * -ENOSPC - insufficient disk resources
936 * -EIO - i/o error
937 */
938 int
dbReAlloc(struct inode * ip,s64 blkno,s64 nblocks,s64 addnblocks,s64 * results)939 dbReAlloc(struct inode *ip,
940 s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
941 {
942 int rc;
943
944 /* try to extend the allocation in place.
945 */
946 if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) {
947 *results = blkno;
948 return (0);
949 } else {
950 if (rc != -ENOSPC)
951 return (rc);
952 }
953
954 /* could not extend the allocation in place, so allocate a
955 * new set of blocks for the entire request (i.e. try to get
956 * a range of contiguous blocks large enough to cover the
957 * existing allocation plus the additional blocks.)
958 */
959 return (dbAlloc
960 (ip, blkno + nblocks - 1, addnblocks + nblocks, results));
961 }
962
963
964 /*
965 * NAME: dbExtend()
966 *
967 * FUNCTION: attempt to extend a current allocation by a specified
968 * number of blocks.
969 *
970 * this routine attempts to satisfy the allocation request
971 * by first trying to extend the existing allocation in
972 * place by allocating the additional blocks as the blocks
973 * immediately following the current allocation.
974 *
975 * PARAMETERS:
976 * ip - pointer to in-core inode requiring allocation.
977 * blkno - starting block of the current allocation.
978 * nblocks - number of contiguous blocks within the current
979 * allocation.
980 * addnblocks - number of blocks to add to the allocation.
981 *
982 * RETURN VALUES:
983 * 0 - success
984 * -ENOSPC - insufficient disk resources
985 * -EIO - i/o error
986 */
dbExtend(struct inode * ip,s64 blkno,s64 nblocks,s64 addnblocks)987 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
988 {
989 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
990 s64 lblkno, lastblkno, extblkno;
991 uint rel_block;
992 struct metapage *mp;
993 struct dmap *dp;
994 int rc;
995 struct inode *ipbmap = sbi->ipbmap;
996 struct bmap *bmp;
997
998 /*
999 * We don't want a non-aligned extent to cross a page boundary
1000 */
1001 if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
1002 (rel_block + nblocks + addnblocks > sbi->nbperpage))
1003 return -ENOSPC;
1004
1005 /* get the last block of the current allocation */
1006 lastblkno = blkno + nblocks - 1;
1007
1008 /* determine the block number of the block following
1009 * the existing allocation.
1010 */
1011 extblkno = lastblkno + 1;
1012
1013 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
1014
1015 /* better be within the file system */
1016 bmp = sbi->bmap;
1017 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1018 IREAD_UNLOCK(ipbmap);
1019 jfs_error(ip->i_sb, "the block is outside the filesystem\n");
1020 return -EIO;
1021 }
1022
1023 /* we'll attempt to extend the current allocation in place by
1024 * allocating the additional blocks as the blocks immediately
1025 * following the current allocation. we only try to extend the
1026 * current allocation in place if the number of additional blocks
1027 * can fit into a dmap, the last block of the current allocation
1028 * is not the last block of the file system, and the start of the
1029 * inplace extension is not on an allocation group boundary.
1030 */
1031 if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
1032 (extblkno & (bmp->db_agsize - 1)) == 0) {
1033 IREAD_UNLOCK(ipbmap);
1034 return -ENOSPC;
1035 }
1036
1037 /* get the buffer for the dmap containing the first block
1038 * of the extension.
1039 */
1040 lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
1041 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
1042 if (mp == NULL) {
1043 IREAD_UNLOCK(ipbmap);
1044 return -EIO;
1045 }
1046
1047 dp = (struct dmap *) mp->data;
1048
1049 /* try to allocate the blocks immediately following the
1050 * current allocation.
1051 */
1052 rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks);
1053
1054 IREAD_UNLOCK(ipbmap);
1055
1056 /* were we successful ? */
1057 if (rc == 0)
1058 write_metapage(mp);
1059 else
1060 /* we were not successful */
1061 release_metapage(mp);
1062
1063 return (rc);
1064 }
1065
1066
1067 /*
1068 * NAME: dbAllocNext()
1069 *
1070 * FUNCTION: attempt to allocate the blocks of the specified block
1071 * range within a dmap.
1072 *
1073 * PARAMETERS:
1074 * bmp - pointer to bmap descriptor
1075 * dp - pointer to dmap.
1076 * blkno - starting block number of the range.
1077 * nblocks - number of contiguous free blocks of the range.
1078 *
1079 * RETURN VALUES:
1080 * 0 - success
1081 * -ENOSPC - insufficient disk resources
1082 * -EIO - i/o error
1083 *
1084 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1085 */
dbAllocNext(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)1086 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1087 int nblocks)
1088 {
1089 int dbitno, word, rembits, nb, nwords, wbitno, nw;
1090 int l2size;
1091 s8 *leaf;
1092 u32 mask;
1093
1094 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1095 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1096 return -EIO;
1097 }
1098
1099 /* pick up a pointer to the leaves of the dmap tree.
1100 */
1101 leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1102
1103 /* determine the bit number and word within the dmap of the
1104 * starting block.
1105 */
1106 dbitno = blkno & (BPERDMAP - 1);
1107 word = dbitno >> L2DBWORD;
1108
1109 /* check if the specified block range is contained within
1110 * this dmap.
1111 */
1112 if (dbitno + nblocks > BPERDMAP)
1113 return -ENOSPC;
1114
1115 /* check if the starting leaf indicates that anything
1116 * is free.
1117 */
1118 if (leaf[word] == NOFREE)
1119 return -ENOSPC;
1120
1121 /* check the dmaps words corresponding to block range to see
1122 * if the block range is free. not all bits of the first and
1123 * last words may be contained within the block range. if this
1124 * is the case, we'll work against those words (i.e. partial first
1125 * and/or last) on an individual basis (a single pass) and examine
1126 * the actual bits to determine if they are free. a single pass
1127 * will be used for all dmap words fully contained within the
1128 * specified range. within this pass, the leaves of the dmap
1129 * tree will be examined to determine if the blocks are free. a
1130 * single leaf may describe the free space of multiple dmap
1131 * words, so we may visit only a subset of the actual leaves
1132 * corresponding to the dmap words of the block range.
1133 */
1134 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
1135 /* determine the bit number within the word and
1136 * the number of bits within the word.
1137 */
1138 wbitno = dbitno & (DBWORD - 1);
1139 nb = min(rembits, DBWORD - wbitno);
1140
1141 /* check if only part of the word is to be examined.
1142 */
1143 if (nb < DBWORD) {
1144 /* check if the bits are free.
1145 */
1146 mask = (ONES << (DBWORD - nb) >> wbitno);
1147 if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
1148 return -ENOSPC;
1149
1150 word += 1;
1151 } else {
1152 /* one or more dmap words are fully contained
1153 * within the block range. determine how many
1154 * words and how many bits.
1155 */
1156 nwords = rembits >> L2DBWORD;
1157 nb = nwords << L2DBWORD;
1158
1159 /* now examine the appropriate leaves to determine
1160 * if the blocks are free.
1161 */
1162 while (nwords > 0) {
1163 /* does the leaf describe any free space ?
1164 */
1165 if (leaf[word] < BUDMIN)
1166 return -ENOSPC;
1167
1168 /* determine the l2 number of bits provided
1169 * by this leaf.
1170 */
1171 l2size =
1172 min_t(int, leaf[word], NLSTOL2BSZ(nwords));
1173
1174 /* determine how many words were handled.
1175 */
1176 nw = BUDSIZE(l2size, BUDMIN);
1177
1178 nwords -= nw;
1179 word += nw;
1180 }
1181 }
1182 }
1183
1184 /* allocate the blocks.
1185 */
1186 return (dbAllocDmap(bmp, dp, blkno, nblocks));
1187 }
1188
1189
1190 /*
1191 * NAME: dbAllocNear()
1192 *
1193 * FUNCTION: attempt to allocate a number of contiguous free blocks near
1194 * a specified block (hint) within a dmap.
1195 *
1196 * starting with the dmap leaf that covers the hint, we'll
1197 * check the next four contiguous leaves for sufficient free
1198 * space. if sufficient free space is found, we'll allocate
1199 * the desired free space.
1200 *
1201 * PARAMETERS:
1202 * bmp - pointer to bmap descriptor
1203 * dp - pointer to dmap.
1204 * blkno - block number to allocate near.
1205 * nblocks - actual number of contiguous free blocks desired.
1206 * l2nb - log2 number of contiguous free blocks desired.
1207 * results - on successful return, set to the starting block number
1208 * of the newly allocated range.
1209 *
1210 * RETURN VALUES:
1211 * 0 - success
1212 * -ENOSPC - insufficient disk resources
1213 * -EIO - i/o error
1214 *
1215 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1216 */
1217 static int
dbAllocNear(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks,int l2nb,s64 * results)1218 dbAllocNear(struct bmap * bmp,
1219 struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
1220 {
1221 int word, lword, rc;
1222 s8 *leaf;
1223
1224 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1225 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1226 return -EIO;
1227 }
1228
1229 leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1230
1231 /* determine the word within the dmap that holds the hint
1232 * (i.e. blkno). also, determine the last word in the dmap
1233 * that we'll include in our examination.
1234 */
1235 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
1236 lword = min(word + 4, LPERDMAP);
1237
1238 /* examine the leaves for sufficient free space.
1239 */
1240 for (; word < lword; word++) {
1241 /* does the leaf describe sufficient free space ?
1242 */
1243 if (leaf[word] < l2nb)
1244 continue;
1245
1246 /* determine the block number within the file system
1247 * of the first block described by this dmap word.
1248 */
1249 blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
1250
1251 /* if not all bits of the dmap word are free, get the
1252 * starting bit number within the dmap word of the required
1253 * string of free bits and adjust the block number with the
1254 * value.
1255 */
1256 if (leaf[word] < BUDMIN)
1257 blkno +=
1258 dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
1259
1260 /* allocate the blocks.
1261 */
1262 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1263 *results = blkno;
1264
1265 return (rc);
1266 }
1267
1268 return -ENOSPC;
1269 }
1270
1271
1272 /*
1273 * NAME: dbAllocAG()
1274 *
1275 * FUNCTION: attempt to allocate the specified number of contiguous
1276 * free blocks within the specified allocation group.
1277 *
1278 * unless the allocation group size is equal to the number
1279 * of blocks per dmap, the dmap control pages will be used to
1280 * find the required free space, if available. we start the
1281 * search at the highest dmap control page level which
1282 * distinctly describes the allocation group's free space
1283 * (i.e. the highest level at which the allocation group's
1284 * free space is not mixed in with that of any other group).
1285 * in addition, we start the search within this level at a
1286 * height of the dmapctl dmtree at which the nodes distinctly
1287 * describe the allocation group's free space. at this height,
1288 * the allocation group's free space may be represented by 1
1289 * or two sub-trees, depending on the allocation group size.
1290 * we search the top nodes of these subtrees left to right for
1291 * sufficient free space. if sufficient free space is found,
1292 * the subtree is searched to find the leftmost leaf that
1293 * has free space. once we have made it to the leaf, we
1294 * move the search to the next lower level dmap control page
1295 * corresponding to this leaf. we continue down the dmap control
1296 * pages until we find the dmap that contains or starts the
1297 * sufficient free space and we allocate at this dmap.
1298 *
1299 * if the allocation group size is equal to the dmap size,
1300 * we'll start at the dmap corresponding to the allocation
1301 * group and attempt the allocation at this level.
1302 *
1303 * the dmap control page search is also not performed if the
1304 * allocation group is completely free and we go to the first
1305 * dmap of the allocation group to do the allocation. this is
1306 * done because the allocation group may be part (not the first
1307 * part) of a larger binary buddy system, causing the dmap
1308 * control pages to indicate no free space (NOFREE) within
1309 * the allocation group.
1310 *
1311 * PARAMETERS:
1312 * bmp - pointer to bmap descriptor
1313 * agno - allocation group number.
1314 * nblocks - actual number of contiguous free blocks desired.
1315 * l2nb - log2 number of contiguous free blocks desired.
1316 * results - on successful return, set to the starting block number
1317 * of the newly allocated range.
1318 *
1319 * RETURN VALUES:
1320 * 0 - success
1321 * -ENOSPC - insufficient disk resources
1322 * -EIO - i/o error
1323 *
1324 * note: IWRITE_LOCK(ipmap) held on entry/exit;
1325 */
1326 static int
dbAllocAG(struct bmap * bmp,int agno,s64 nblocks,int l2nb,s64 * results)1327 dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1328 {
1329 struct metapage *mp;
1330 struct dmapctl *dcp;
1331 int rc, ti, i, k, m, n, agperlev;
1332 s64 blkno, lblkno;
1333 int budmin;
1334
1335 /* allocation request should not be for more than the
1336 * allocation group size.
1337 */
1338 if (l2nb > bmp->db_agl2size) {
1339 jfs_error(bmp->db_ipbmap->i_sb,
1340 "allocation request is larger than the allocation group size\n");
1341 return -EIO;
1342 }
1343
1344 /* determine the starting block number of the allocation
1345 * group.
1346 */
1347 blkno = (s64) agno << bmp->db_agl2size;
1348
1349 /* check if the allocation group size is the minimum allocation
1350 * group size or if the allocation group is completely free. if
1351 * the allocation group size is the minimum size of BPERDMAP (i.e.
1352 * 1 dmap), there is no need to search the dmap control page (below)
1353 * that fully describes the allocation group since the allocation
1354 * group is already fully described by a dmap. in this case, we
1355 * just call dbAllocCtl() to search the dmap tree and allocate the
1356 * required space if available.
1357 *
1358 * if the allocation group is completely free, dbAllocCtl() is
1359 * also called to allocate the required space. this is done for
1360 * two reasons. first, it makes no sense searching the dmap control
1361 * pages for free space when we know that free space exists. second,
1362 * the dmap control pages may indicate that the allocation group
1363 * has no free space if the allocation group is part (not the first
1364 * part) of a larger binary buddy system.
1365 */
1366 if (bmp->db_agsize == BPERDMAP
1367 || bmp->db_agfree[agno] == bmp->db_agsize) {
1368 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1369 if ((rc == -ENOSPC) &&
1370 (bmp->db_agfree[agno] == bmp->db_agsize)) {
1371 printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n",
1372 (unsigned long long) blkno,
1373 (unsigned long long) nblocks);
1374 jfs_error(bmp->db_ipbmap->i_sb,
1375 "dbAllocCtl failed in free AG\n");
1376 }
1377 return (rc);
1378 }
1379
1380 /* the buffer for the dmap control page that fully describes the
1381 * allocation group.
1382 */
1383 lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
1384 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1385 if (mp == NULL)
1386 return -EIO;
1387 dcp = (struct dmapctl *) mp->data;
1388 budmin = dcp->budmin;
1389
1390 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1391 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
1392 release_metapage(mp);
1393 return -EIO;
1394 }
1395
1396 /* search the subtree(s) of the dmap control page that describes
1397 * the allocation group, looking for sufficient free space. to begin,
1398 * determine how many allocation groups are represented in a dmap
1399 * control page at the control page level (i.e. L0, L1, L2) that
1400 * fully describes an allocation group. next, determine the starting
1401 * tree index of this allocation group within the control page.
1402 */
1403 agperlev =
1404 (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
1405 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1406
1407 /* dmap control page trees fan-out by 4 and a single allocation
1408 * group may be described by 1 or 2 subtrees within the ag level
1409 * dmap control page, depending upon the ag size. examine the ag's
1410 * subtrees for sufficient free space, starting with the leftmost
1411 * subtree.
1412 */
1413 for (i = 0; i < bmp->db_agwidth; i++, ti++) {
1414 /* is there sufficient free space ?
1415 */
1416 if (l2nb > dcp->stree[ti])
1417 continue;
1418
1419 /* sufficient free space found in a subtree. now search down
1420 * the subtree to find the leftmost leaf that describes this
1421 * free space.
1422 */
1423 for (k = bmp->db_agheight; k > 0; k--) {
1424 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1425 if (l2nb <= dcp->stree[m + n]) {
1426 ti = m + n;
1427 break;
1428 }
1429 }
1430 if (n == 4) {
1431 jfs_error(bmp->db_ipbmap->i_sb,
1432 "failed descending stree\n");
1433 release_metapage(mp);
1434 return -EIO;
1435 }
1436 }
1437
1438 /* determine the block number within the file system
1439 * that corresponds to this leaf.
1440 */
1441 if (bmp->db_aglevel == 2)
1442 blkno = 0;
1443 else if (bmp->db_aglevel == 1)
1444 blkno &= ~(MAXL1SIZE - 1);
1445 else /* bmp->db_aglevel == 0 */
1446 blkno &= ~(MAXL0SIZE - 1);
1447
1448 blkno +=
1449 ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
1450
1451 /* release the buffer in preparation for going down
1452 * the next level of dmap control pages.
1453 */
1454 release_metapage(mp);
1455
1456 /* check if we need to continue to search down the lower
1457 * level dmap control pages. we need to if the number of
1458 * blocks required is less than maximum number of blocks
1459 * described at the next lower level.
1460 */
1461 if (l2nb < budmin) {
1462
1463 /* search the lower level dmap control pages to get
1464 * the starting block number of the dmap that
1465 * contains or starts off the free space.
1466 */
1467 if ((rc =
1468 dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
1469 &blkno))) {
1470 if (rc == -ENOSPC) {
1471 jfs_error(bmp->db_ipbmap->i_sb,
1472 "control page inconsistent\n");
1473 return -EIO;
1474 }
1475 return (rc);
1476 }
1477 }
1478
1479 /* allocate the blocks.
1480 */
1481 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1482 if (rc == -ENOSPC) {
1483 jfs_error(bmp->db_ipbmap->i_sb,
1484 "unable to allocate blocks\n");
1485 rc = -EIO;
1486 }
1487 return (rc);
1488 }
1489
1490 /* no space in the allocation group. release the buffer and
1491 * return -ENOSPC.
1492 */
1493 release_metapage(mp);
1494
1495 return -ENOSPC;
1496 }
1497
1498
1499 /*
1500 * NAME: dbAllocAny()
1501 *
1502 * FUNCTION: attempt to allocate the specified number of contiguous
1503 * free blocks anywhere in the file system.
1504 *
1505 * dbAllocAny() attempts to find the sufficient free space by
1506 * searching down the dmap control pages, starting with the
1507 * highest level (i.e. L0, L1, L2) control page. if free space
1508 * large enough to satisfy the desired free space is found, the
1509 * desired free space is allocated.
1510 *
1511 * PARAMETERS:
1512 * bmp - pointer to bmap descriptor
1513 * nblocks - actual number of contiguous free blocks desired.
1514 * l2nb - log2 number of contiguous free blocks desired.
1515 * results - on successful return, set to the starting block number
1516 * of the newly allocated range.
1517 *
1518 * RETURN VALUES:
1519 * 0 - success
1520 * -ENOSPC - insufficient disk resources
1521 * -EIO - i/o error
1522 *
1523 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1524 */
dbAllocAny(struct bmap * bmp,s64 nblocks,int l2nb,s64 * results)1525 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1526 {
1527 int rc;
1528 s64 blkno = 0;
1529
1530 /* starting with the top level dmap control page, search
1531 * down the dmap control levels for sufficient free space.
1532 * if free space is found, dbFindCtl() returns the starting
1533 * block number of the dmap that contains or starts off the
1534 * range of free space.
1535 */
1536 if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
1537 return (rc);
1538
1539 /* allocate the blocks.
1540 */
1541 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1542 if (rc == -ENOSPC) {
1543 jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
1544 return -EIO;
1545 }
1546 return (rc);
1547 }
1548
1549
1550 /*
1551 * NAME: dbDiscardAG()
1552 *
1553 * FUNCTION: attempt to discard (TRIM) all free blocks of specific AG
1554 *
1555 * algorithm:
1556 * 1) allocate blocks, as large as possible and save them
1557 * while holding IWRITE_LOCK on ipbmap
1558 * 2) trim all these saved block/length values
1559 * 3) mark the blocks free again
1560 *
1561 * benefit:
1562 * - we work only on one ag at some time, minimizing how long we
1563 * need to lock ipbmap
1564 * - reading / writing the fs is possible most time, even on
1565 * trimming
1566 *
1567 * downside:
1568 * - we write two times to the dmapctl and dmap pages
1569 * - but for me, this seems the best way, better ideas?
1570 * /TR 2012
1571 *
1572 * PARAMETERS:
1573 * ip - pointer to in-core inode
1574 * agno - ag to trim
1575 * minlen - minimum value of contiguous blocks
1576 *
1577 * RETURN VALUES:
1578 * s64 - actual number of blocks trimmed
1579 */
dbDiscardAG(struct inode * ip,int agno,s64 minlen)1580 s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1581 {
1582 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
1583 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
1584 s64 nblocks, blkno;
1585 u64 trimmed = 0;
1586 int rc, l2nb;
1587 struct super_block *sb = ipbmap->i_sb;
1588
1589 struct range2trim {
1590 u64 blkno;
1591 u64 nblocks;
1592 } *totrim, *tt;
1593
1594 /* max blkno / nblocks pairs to trim */
1595 int count = 0, range_cnt;
1596 u64 max_ranges;
1597
1598 /* prevent others from writing new stuff here, while trimming */
1599 IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
1600
1601 nblocks = bmp->db_agfree[agno];
1602 max_ranges = nblocks;
1603 do_div(max_ranges, minlen);
1604 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
1605 totrim = kmalloc_array(range_cnt, sizeof(struct range2trim), GFP_NOFS);
1606 if (totrim == NULL) {
1607 jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
1608 IWRITE_UNLOCK(ipbmap);
1609 return 0;
1610 }
1611
1612 tt = totrim;
1613 while (nblocks >= minlen) {
1614 l2nb = BLKSTOL2(nblocks);
1615
1616 /* 0 = okay, -EIO = fatal, -ENOSPC -> try smaller block */
1617 rc = dbAllocAG(bmp, agno, nblocks, l2nb, &blkno);
1618 if (rc == 0) {
1619 tt->blkno = blkno;
1620 tt->nblocks = nblocks;
1621 tt++; count++;
1622
1623 /* the whole ag is free, trim now */
1624 if (bmp->db_agfree[agno] == 0)
1625 break;
1626
1627 /* give a hint for the next while */
1628 nblocks = bmp->db_agfree[agno];
1629 continue;
1630 } else if (rc == -ENOSPC) {
1631 /* search for next smaller log2 block */
1632 l2nb = BLKSTOL2(nblocks) - 1;
1633 if (unlikely(l2nb < 0))
1634 break;
1635 nblocks = 1LL << l2nb;
1636 } else {
1637 /* Trim any already allocated blocks */
1638 jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
1639 break;
1640 }
1641
1642 /* check, if our trim array is full */
1643 if (unlikely(count >= range_cnt - 1))
1644 break;
1645 }
1646 IWRITE_UNLOCK(ipbmap);
1647
1648 tt->nblocks = 0; /* mark the current end */
1649 for (tt = totrim; tt->nblocks != 0; tt++) {
1650 /* when mounted with online discard, dbFree() will
1651 * call jfs_issue_discard() itself */
1652 if (!(JFS_SBI(sb)->flag & JFS_DISCARD))
1653 jfs_issue_discard(ip, tt->blkno, tt->nblocks);
1654 dbFree(ip, tt->blkno, tt->nblocks);
1655 trimmed += tt->nblocks;
1656 }
1657 kfree(totrim);
1658
1659 return trimmed;
1660 }
1661
1662 /*
1663 * NAME: dbFindCtl()
1664 *
1665 * FUNCTION: starting at a specified dmap control page level and block
1666 * number, search down the dmap control levels for a range of
1667 * contiguous free blocks large enough to satisfy an allocation
1668 * request for the specified number of free blocks.
1669 *
1670 * if sufficient contiguous free blocks are found, this routine
1671 * returns the starting block number within a dmap page that
1672 * contains or starts a range of contiqious free blocks that
1673 * is sufficient in size.
1674 *
1675 * PARAMETERS:
1676 * bmp - pointer to bmap descriptor
1677 * level - starting dmap control page level.
1678 * l2nb - log2 number of contiguous free blocks desired.
1679 * *blkno - on entry, starting block number for conducting the search.
1680 * on successful return, the first block within a dmap page
1681 * that contains or starts a range of contiguous free blocks.
1682 *
1683 * RETURN VALUES:
1684 * 0 - success
1685 * -ENOSPC - insufficient disk resources
1686 * -EIO - i/o error
1687 *
1688 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1689 */
dbFindCtl(struct bmap * bmp,int l2nb,int level,s64 * blkno)1690 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1691 {
1692 int rc, leafidx, lev;
1693 s64 b, lblkno;
1694 struct dmapctl *dcp;
1695 int budmin;
1696 struct metapage *mp;
1697
1698 /* starting at the specified dmap control page level and block
1699 * number, search down the dmap control levels for the starting
1700 * block number of a dmap page that contains or starts off
1701 * sufficient free blocks.
1702 */
1703 for (lev = level, b = *blkno; lev >= 0; lev--) {
1704 /* get the buffer of the dmap control page for the block
1705 * number and level (i.e. L0, L1, L2).
1706 */
1707 lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
1708 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1709 if (mp == NULL)
1710 return -EIO;
1711 dcp = (struct dmapctl *) mp->data;
1712 budmin = dcp->budmin;
1713
1714 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1715 jfs_error(bmp->db_ipbmap->i_sb,
1716 "Corrupt dmapctl page\n");
1717 release_metapage(mp);
1718 return -EIO;
1719 }
1720
1721 /* search the tree within the dmap control page for
1722 * sufficient free space. if sufficient free space is found,
1723 * dbFindLeaf() returns the index of the leaf at which
1724 * free space was found.
1725 */
1726 rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx, true);
1727
1728 /* release the buffer.
1729 */
1730 release_metapage(mp);
1731
1732 /* space found ?
1733 */
1734 if (rc) {
1735 if (lev != level) {
1736 jfs_error(bmp->db_ipbmap->i_sb,
1737 "dmap inconsistent\n");
1738 return -EIO;
1739 }
1740 return -ENOSPC;
1741 }
1742
1743 /* adjust the block number to reflect the location within
1744 * the dmap control page (i.e. the leaf) at which free
1745 * space was found.
1746 */
1747 b += (((s64) leafidx) << budmin);
1748
1749 /* we stop the search at this dmap control page level if
1750 * the number of blocks required is greater than or equal
1751 * to the maximum number of blocks described at the next
1752 * (lower) level.
1753 */
1754 if (l2nb >= budmin)
1755 break;
1756 }
1757
1758 *blkno = b;
1759 return (0);
1760 }
1761
1762
1763 /*
1764 * NAME: dbAllocCtl()
1765 *
1766 * FUNCTION: attempt to allocate a specified number of contiguous
1767 * blocks starting within a specific dmap.
1768 *
1769 * this routine is called by higher level routines that search
1770 * the dmap control pages above the actual dmaps for contiguous
1771 * free space. the result of successful searches by these
1772 * routines are the starting block numbers within dmaps, with
1773 * the dmaps themselves containing the desired contiguous free
1774 * space or starting a contiguous free space of desired size
1775 * that is made up of the blocks of one or more dmaps. these
1776 * calls should not fail due to insufficent resources.
1777 *
1778 * this routine is called in some cases where it is not known
1779 * whether it will fail due to insufficient resources. more
1780 * specifically, this occurs when allocating from an allocation
1781 * group whose size is equal to the number of blocks per dmap.
1782 * in this case, the dmap control pages are not examined prior
1783 * to calling this routine (to save pathlength) and the call
1784 * might fail.
1785 *
1786 * for a request size that fits within a dmap, this routine relies
1787 * upon the dmap's dmtree to find the requested contiguous free
1788 * space. for request sizes that are larger than a dmap, the
1789 * requested free space will start at the first block of the
1790 * first dmap (i.e. blkno).
1791 *
1792 * PARAMETERS:
1793 * bmp - pointer to bmap descriptor
1794 * nblocks - actual number of contiguous free blocks to allocate.
1795 * l2nb - log2 number of contiguous free blocks to allocate.
1796 * blkno - starting block number of the dmap to start the allocation
1797 * from.
1798 * results - on successful return, set to the starting block number
1799 * of the newly allocated range.
1800 *
1801 * RETURN VALUES:
1802 * 0 - success
1803 * -ENOSPC - insufficient disk resources
1804 * -EIO - i/o error
1805 *
1806 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1807 */
1808 static int
dbAllocCtl(struct bmap * bmp,s64 nblocks,int l2nb,s64 blkno,s64 * results)1809 dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1810 {
1811 int rc, nb;
1812 s64 b, lblkno, n;
1813 struct metapage *mp;
1814 struct dmap *dp;
1815
1816 /* check if the allocation request is confined to a single dmap.
1817 */
1818 if (l2nb <= L2BPERDMAP) {
1819 /* get the buffer for the dmap.
1820 */
1821 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
1822 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1823 if (mp == NULL)
1824 return -EIO;
1825 dp = (struct dmap *) mp->data;
1826
1827 if (dp->tree.budmin < 0)
1828 return -EIO;
1829
1830 /* try to allocate the blocks.
1831 */
1832 rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
1833 if (rc == 0)
1834 mark_metapage_dirty(mp);
1835
1836 release_metapage(mp);
1837
1838 return (rc);
1839 }
1840
1841 /* allocation request involving multiple dmaps. it must start on
1842 * a dmap boundary.
1843 */
1844 assert((blkno & (BPERDMAP - 1)) == 0);
1845
1846 /* allocate the blocks dmap by dmap.
1847 */
1848 for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
1849 /* get the buffer for the dmap.
1850 */
1851 lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
1852 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1853 if (mp == NULL) {
1854 rc = -EIO;
1855 goto backout;
1856 }
1857 dp = (struct dmap *) mp->data;
1858
1859 /* the dmap better be all free.
1860 */
1861 if (dp->tree.stree[ROOT] != L2BPERDMAP) {
1862 release_metapage(mp);
1863 jfs_error(bmp->db_ipbmap->i_sb,
1864 "the dmap is not all free\n");
1865 rc = -EIO;
1866 goto backout;
1867 }
1868
1869 /* determine how many blocks to allocate from this dmap.
1870 */
1871 nb = min_t(s64, n, BPERDMAP);
1872
1873 /* allocate the blocks from the dmap.
1874 */
1875 if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
1876 release_metapage(mp);
1877 goto backout;
1878 }
1879
1880 /* write the buffer.
1881 */
1882 write_metapage(mp);
1883 }
1884
1885 /* set the results (starting block number) and return.
1886 */
1887 *results = blkno;
1888 return (0);
1889
1890 /* something failed in handling an allocation request involving
1891 * multiple dmaps. we'll try to clean up by backing out any
1892 * allocation that has already happened for this request. if
1893 * we fail in backing out the allocation, we'll mark the file
1894 * system to indicate that blocks have been leaked.
1895 */
1896 backout:
1897
1898 /* try to backout the allocations dmap by dmap.
1899 */
1900 for (n = nblocks - n, b = blkno; n > 0;
1901 n -= BPERDMAP, b += BPERDMAP) {
1902 /* get the buffer for this dmap.
1903 */
1904 lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
1905 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1906 if (mp == NULL) {
1907 /* could not back out. mark the file system
1908 * to indicate that we have leaked blocks.
1909 */
1910 jfs_error(bmp->db_ipbmap->i_sb,
1911 "I/O Error: Block Leakage\n");
1912 continue;
1913 }
1914 dp = (struct dmap *) mp->data;
1915
1916 /* free the blocks is this dmap.
1917 */
1918 if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
1919 /* could not back out. mark the file system
1920 * to indicate that we have leaked blocks.
1921 */
1922 release_metapage(mp);
1923 jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
1924 continue;
1925 }
1926
1927 /* write the buffer.
1928 */
1929 write_metapage(mp);
1930 }
1931
1932 return (rc);
1933 }
1934
1935
1936 /*
1937 * NAME: dbAllocDmapLev()
1938 *
1939 * FUNCTION: attempt to allocate a specified number of contiguous blocks
1940 * from a specified dmap.
1941 *
1942 * this routine checks if the contiguous blocks are available.
1943 * if so, nblocks of blocks are allocated; otherwise, ENOSPC is
1944 * returned.
1945 *
1946 * PARAMETERS:
1947 * mp - pointer to bmap descriptor
1948 * dp - pointer to dmap to attempt to allocate blocks from.
1949 * l2nb - log2 number of contiguous block desired.
1950 * nblocks - actual number of contiguous block desired.
1951 * results - on successful return, set to the starting block number
1952 * of the newly allocated range.
1953 *
1954 * RETURN VALUES:
1955 * 0 - success
1956 * -ENOSPC - insufficient disk resources
1957 * -EIO - i/o error
1958 *
1959 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
1960 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
1961 */
1962 static int
dbAllocDmapLev(struct bmap * bmp,struct dmap * dp,int nblocks,int l2nb,s64 * results)1963 dbAllocDmapLev(struct bmap * bmp,
1964 struct dmap * dp, int nblocks, int l2nb, s64 * results)
1965 {
1966 s64 blkno;
1967 int leafidx, rc;
1968
1969 /* can't be more than a dmaps worth of blocks */
1970 assert(l2nb <= L2BPERDMAP);
1971
1972 /* search the tree within the dmap page for sufficient
1973 * free space. if sufficient free space is found, dbFindLeaf()
1974 * returns the index of the leaf at which free space was found.
1975 */
1976 if (dbFindLeaf((dmtree_t *) &dp->tree, l2nb, &leafidx, false))
1977 return -ENOSPC;
1978
1979 if (leafidx < 0)
1980 return -EIO;
1981
1982 /* determine the block number within the file system corresponding
1983 * to the leaf at which free space was found.
1984 */
1985 blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
1986
1987 /* if not all bits of the dmap word are free, get the starting
1988 * bit number within the dmap word of the required string of free
1989 * bits and adjust the block number with this value.
1990 */
1991 if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
1992 blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
1993
1994 /* allocate the blocks */
1995 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1996 *results = blkno;
1997
1998 return (rc);
1999 }
2000
2001
2002 /*
2003 * NAME: dbAllocDmap()
2004 *
2005 * FUNCTION: adjust the disk allocation map to reflect the allocation
2006 * of a specified block range within a dmap.
2007 *
2008 * this routine allocates the specified blocks from the dmap
2009 * through a call to dbAllocBits(). if the allocation of the
2010 * block range causes the maximum string of free blocks within
2011 * the dmap to change (i.e. the value of the root of the dmap's
2012 * dmtree), this routine will cause this change to be reflected
2013 * up through the appropriate levels of the dmap control pages
2014 * by a call to dbAdjCtl() for the L0 dmap control page that
2015 * covers this dmap.
2016 *
2017 * PARAMETERS:
2018 * bmp - pointer to bmap descriptor
2019 * dp - pointer to dmap to allocate the block range from.
2020 * blkno - starting block number of the block to be allocated.
2021 * nblocks - number of blocks to be allocated.
2022 *
2023 * RETURN VALUES:
2024 * 0 - success
2025 * -EIO - i/o error
2026 *
2027 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2028 */
dbAllocDmap(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)2029 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2030 int nblocks)
2031 {
2032 s8 oldroot;
2033 int rc;
2034
2035 /* save the current value of the root (i.e. maximum free string)
2036 * of the dmap tree.
2037 */
2038 oldroot = dp->tree.stree[ROOT];
2039
2040 /* allocate the specified (blocks) bits */
2041 dbAllocBits(bmp, dp, blkno, nblocks);
2042
2043 /* if the root has not changed, done. */
2044 if (dp->tree.stree[ROOT] == oldroot)
2045 return (0);
2046
2047 /* root changed. bubble the change up to the dmap control pages.
2048 * if the adjustment of the upper level control pages fails,
2049 * backout the bit allocation (thus making everything consistent).
2050 */
2051 if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0)))
2052 dbFreeBits(bmp, dp, blkno, nblocks);
2053
2054 return (rc);
2055 }
2056
2057
2058 /*
2059 * NAME: dbFreeDmap()
2060 *
2061 * FUNCTION: adjust the disk allocation map to reflect the allocation
2062 * of a specified block range within a dmap.
2063 *
2064 * this routine frees the specified blocks from the dmap through
2065 * a call to dbFreeBits(). if the deallocation of the block range
2066 * causes the maximum string of free blocks within the dmap to
2067 * change (i.e. the value of the root of the dmap's dmtree), this
2068 * routine will cause this change to be reflected up through the
2069 * appropriate levels of the dmap control pages by a call to
2070 * dbAdjCtl() for the L0 dmap control page that covers this dmap.
2071 *
2072 * PARAMETERS:
2073 * bmp - pointer to bmap descriptor
2074 * dp - pointer to dmap to free the block range from.
2075 * blkno - starting block number of the block to be freed.
2076 * nblocks - number of blocks to be freed.
2077 *
2078 * RETURN VALUES:
2079 * 0 - success
2080 * -EIO - i/o error
2081 *
2082 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2083 */
dbFreeDmap(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)2084 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2085 int nblocks)
2086 {
2087 s8 oldroot;
2088 int rc = 0, word;
2089
2090 /* save the current value of the root (i.e. maximum free string)
2091 * of the dmap tree.
2092 */
2093 oldroot = dp->tree.stree[ROOT];
2094
2095 /* free the specified (blocks) bits */
2096 rc = dbFreeBits(bmp, dp, blkno, nblocks);
2097
2098 /* if error or the root has not changed, done. */
2099 if (rc || (dp->tree.stree[ROOT] == oldroot))
2100 return (rc);
2101
2102 /* root changed. bubble the change up to the dmap control pages.
2103 * if the adjustment of the upper level control pages fails,
2104 * backout the deallocation.
2105 */
2106 if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
2107 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
2108
2109 /* as part of backing out the deallocation, we will have
2110 * to back split the dmap tree if the deallocation caused
2111 * the freed blocks to become part of a larger binary buddy
2112 * system.
2113 */
2114 if (dp->tree.stree[word] == NOFREE)
2115 dbBackSplit((dmtree_t *)&dp->tree, word, false);
2116
2117 dbAllocBits(bmp, dp, blkno, nblocks);
2118 }
2119
2120 return (rc);
2121 }
2122
2123
2124 /*
2125 * NAME: dbAllocBits()
2126 *
2127 * FUNCTION: allocate a specified block range from a dmap.
2128 *
2129 * this routine updates the dmap to reflect the working
2130 * state allocation of the specified block range. it directly
2131 * updates the bits of the working map and causes the adjustment
2132 * of the binary buddy system described by the dmap's dmtree
2133 * leaves to reflect the bits allocated. it also causes the
2134 * dmap's dmtree, as a whole, to reflect the allocated range.
2135 *
2136 * PARAMETERS:
2137 * bmp - pointer to bmap descriptor
2138 * dp - pointer to dmap to allocate bits from.
2139 * blkno - starting block number of the bits to be allocated.
2140 * nblocks - number of bits to be allocated.
2141 *
2142 * RETURN VALUES: none
2143 *
2144 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2145 */
dbAllocBits(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)2146 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2147 int nblocks)
2148 {
2149 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2150 dmtree_t *tp = (dmtree_t *) & dp->tree;
2151 int size;
2152 s8 *leaf;
2153
2154 /* pick up a pointer to the leaves of the dmap tree */
2155 leaf = dp->tree.stree + LEAFIND;
2156
2157 /* determine the bit number and word within the dmap of the
2158 * starting block.
2159 */
2160 dbitno = blkno & (BPERDMAP - 1);
2161 word = dbitno >> L2DBWORD;
2162
2163 /* block range better be within the dmap */
2164 assert(dbitno + nblocks <= BPERDMAP);
2165
2166 /* allocate the bits of the dmap's words corresponding to the block
2167 * range. not all bits of the first and last words may be contained
2168 * within the block range. if this is the case, we'll work against
2169 * those words (i.e. partial first and/or last) on an individual basis
2170 * (a single pass), allocating the bits of interest by hand and
2171 * updating the leaf corresponding to the dmap word. a single pass
2172 * will be used for all dmap words fully contained within the
2173 * specified range. within this pass, the bits of all fully contained
2174 * dmap words will be marked as free in a single shot and the leaves
2175 * will be updated. a single leaf may describe the free space of
2176 * multiple dmap words, so we may update only a subset of the actual
2177 * leaves corresponding to the dmap words of the block range.
2178 */
2179 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2180 /* determine the bit number within the word and
2181 * the number of bits within the word.
2182 */
2183 wbitno = dbitno & (DBWORD - 1);
2184 nb = min(rembits, DBWORD - wbitno);
2185
2186 /* check if only part of a word is to be allocated.
2187 */
2188 if (nb < DBWORD) {
2189 /* allocate (set to 1) the appropriate bits within
2190 * this dmap word.
2191 */
2192 dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
2193 >> wbitno);
2194
2195 /* update the leaf for this dmap word. in addition
2196 * to setting the leaf value to the binary buddy max
2197 * of the updated dmap word, dbSplit() will split
2198 * the binary system of the leaves if need be.
2199 */
2200 dbSplit(tp, word, BUDMIN,
2201 dbMaxBud((u8 *)&dp->wmap[word]), false);
2202
2203 word += 1;
2204 } else {
2205 /* one or more dmap words are fully contained
2206 * within the block range. determine how many
2207 * words and allocate (set to 1) the bits of these
2208 * words.
2209 */
2210 nwords = rembits >> L2DBWORD;
2211 memset(&dp->wmap[word], (int) ONES, nwords * 4);
2212
2213 /* determine how many bits.
2214 */
2215 nb = nwords << L2DBWORD;
2216
2217 /* now update the appropriate leaves to reflect
2218 * the allocated words.
2219 */
2220 for (; nwords > 0; nwords -= nw) {
2221 if (leaf[word] < BUDMIN) {
2222 jfs_error(bmp->db_ipbmap->i_sb,
2223 "leaf page corrupt\n");
2224 break;
2225 }
2226
2227 /* determine what the leaf value should be
2228 * updated to as the minimum of the l2 number
2229 * of bits being allocated and the l2 number
2230 * of bits currently described by this leaf.
2231 */
2232 size = min_t(int, leaf[word],
2233 NLSTOL2BSZ(nwords));
2234
2235 /* update the leaf to reflect the allocation.
2236 * in addition to setting the leaf value to
2237 * NOFREE, dbSplit() will split the binary
2238 * system of the leaves to reflect the current
2239 * allocation (size).
2240 */
2241 dbSplit(tp, word, size, NOFREE, false);
2242
2243 /* get the number of dmap words handled */
2244 nw = BUDSIZE(size, BUDMIN);
2245 word += nw;
2246 }
2247 }
2248 }
2249
2250 /* update the free count for this dmap */
2251 le32_add_cpu(&dp->nfree, -nblocks);
2252
2253 BMAP_LOCK(bmp);
2254
2255 /* if this allocation group is completely free,
2256 * update the maximum allocation group number if this allocation
2257 * group is the new max.
2258 */
2259 agno = blkno >> bmp->db_agl2size;
2260 if (agno > bmp->db_maxag)
2261 bmp->db_maxag = agno;
2262
2263 /* update the free count for the allocation group and map */
2264 bmp->db_agfree[agno] -= nblocks;
2265 bmp->db_nfree -= nblocks;
2266
2267 BMAP_UNLOCK(bmp);
2268 }
2269
2270
2271 /*
2272 * NAME: dbFreeBits()
2273 *
2274 * FUNCTION: free a specified block range from a dmap.
2275 *
2276 * this routine updates the dmap to reflect the working
2277 * state allocation of the specified block range. it directly
2278 * updates the bits of the working map and causes the adjustment
2279 * of the binary buddy system described by the dmap's dmtree
2280 * leaves to reflect the bits freed. it also causes the dmap's
2281 * dmtree, as a whole, to reflect the deallocated range.
2282 *
2283 * PARAMETERS:
2284 * bmp - pointer to bmap descriptor
2285 * dp - pointer to dmap to free bits from.
2286 * blkno - starting block number of the bits to be freed.
2287 * nblocks - number of bits to be freed.
2288 *
2289 * RETURN VALUES: 0 for success
2290 *
2291 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2292 */
dbFreeBits(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)2293 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2294 int nblocks)
2295 {
2296 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2297 dmtree_t *tp = (dmtree_t *) & dp->tree;
2298 int rc = 0;
2299 int size;
2300
2301 /* determine the bit number and word within the dmap of the
2302 * starting block.
2303 */
2304 dbitno = blkno & (BPERDMAP - 1);
2305 word = dbitno >> L2DBWORD;
2306
2307 /* block range better be within the dmap.
2308 */
2309 assert(dbitno + nblocks <= BPERDMAP);
2310
2311 /* free the bits of the dmaps words corresponding to the block range.
2312 * not all bits of the first and last words may be contained within
2313 * the block range. if this is the case, we'll work against those
2314 * words (i.e. partial first and/or last) on an individual basis
2315 * (a single pass), freeing the bits of interest by hand and updating
2316 * the leaf corresponding to the dmap word. a single pass will be used
2317 * for all dmap words fully contained within the specified range.
2318 * within this pass, the bits of all fully contained dmap words will
2319 * be marked as free in a single shot and the leaves will be updated. a
2320 * single leaf may describe the free space of multiple dmap words,
2321 * so we may update only a subset of the actual leaves corresponding
2322 * to the dmap words of the block range.
2323 *
2324 * dbJoin() is used to update leaf values and will join the binary
2325 * buddy system of the leaves if the new leaf values indicate this
2326 * should be done.
2327 */
2328 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2329 /* determine the bit number within the word and
2330 * the number of bits within the word.
2331 */
2332 wbitno = dbitno & (DBWORD - 1);
2333 nb = min(rembits, DBWORD - wbitno);
2334
2335 /* check if only part of a word is to be freed.
2336 */
2337 if (nb < DBWORD) {
2338 /* free (zero) the appropriate bits within this
2339 * dmap word.
2340 */
2341 dp->wmap[word] &=
2342 cpu_to_le32(~(ONES << (DBWORD - nb)
2343 >> wbitno));
2344
2345 /* update the leaf for this dmap word.
2346 */
2347 rc = dbJoin(tp, word,
2348 dbMaxBud((u8 *)&dp->wmap[word]), false);
2349 if (rc)
2350 return rc;
2351
2352 word += 1;
2353 } else {
2354 /* one or more dmap words are fully contained
2355 * within the block range. determine how many
2356 * words and free (zero) the bits of these words.
2357 */
2358 nwords = rembits >> L2DBWORD;
2359 memset(&dp->wmap[word], 0, nwords * 4);
2360
2361 /* determine how many bits.
2362 */
2363 nb = nwords << L2DBWORD;
2364
2365 /* now update the appropriate leaves to reflect
2366 * the freed words.
2367 */
2368 for (; nwords > 0; nwords -= nw) {
2369 /* determine what the leaf value should be
2370 * updated to as the minimum of the l2 number
2371 * of bits being freed and the l2 (max) number
2372 * of bits that can be described by this leaf.
2373 */
2374 size =
2375 min(LITOL2BSZ
2376 (word, L2LPERDMAP, BUDMIN),
2377 NLSTOL2BSZ(nwords));
2378
2379 /* update the leaf.
2380 */
2381 rc = dbJoin(tp, word, size, false);
2382 if (rc)
2383 return rc;
2384
2385 /* get the number of dmap words handled.
2386 */
2387 nw = BUDSIZE(size, BUDMIN);
2388 word += nw;
2389 }
2390 }
2391 }
2392
2393 /* update the free count for this dmap.
2394 */
2395 le32_add_cpu(&dp->nfree, nblocks);
2396
2397 BMAP_LOCK(bmp);
2398
2399 /* update the free count for the allocation group and
2400 * map.
2401 */
2402 agno = blkno >> bmp->db_agl2size;
2403 bmp->db_nfree += nblocks;
2404 bmp->db_agfree[agno] += nblocks;
2405
2406 /* check if this allocation group is not completely free and
2407 * if it is currently the maximum (rightmost) allocation group.
2408 * if so, establish the new maximum allocation group number by
2409 * searching left for the first allocation group with allocation.
2410 */
2411 if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
2412 (agno == bmp->db_numag - 1 &&
2413 bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
2414 while (bmp->db_maxag > 0) {
2415 bmp->db_maxag -= 1;
2416 if (bmp->db_agfree[bmp->db_maxag] !=
2417 bmp->db_agsize)
2418 break;
2419 }
2420
2421 /* re-establish the allocation group preference if the
2422 * current preference is right of the maximum allocation
2423 * group.
2424 */
2425 if (bmp->db_agpref > bmp->db_maxag)
2426 bmp->db_agpref = bmp->db_maxag;
2427 }
2428
2429 BMAP_UNLOCK(bmp);
2430
2431 return 0;
2432 }
2433
2434
2435 /*
2436 * NAME: dbAdjCtl()
2437 *
2438 * FUNCTION: adjust a dmap control page at a specified level to reflect
2439 * the change in a lower level dmap or dmap control page's
2440 * maximum string of free blocks (i.e. a change in the root
2441 * of the lower level object's dmtree) due to the allocation
2442 * or deallocation of a range of blocks with a single dmap.
2443 *
2444 * on entry, this routine is provided with the new value of
2445 * the lower level dmap or dmap control page root and the
2446 * starting block number of the block range whose allocation
2447 * or deallocation resulted in the root change. this range
2448 * is respresented by a single leaf of the current dmapctl
2449 * and the leaf will be updated with this value, possibly
2450 * causing a binary buddy system within the leaves to be
2451 * split or joined. the update may also cause the dmapctl's
2452 * dmtree to be updated.
2453 *
2454 * if the adjustment of the dmap control page, itself, causes its
2455 * root to change, this change will be bubbled up to the next dmap
2456 * control level by a recursive call to this routine, specifying
2457 * the new root value and the next dmap control page level to
2458 * be adjusted.
2459 * PARAMETERS:
2460 * bmp - pointer to bmap descriptor
2461 * blkno - the first block of a block range within a dmap. it is
2462 * the allocation or deallocation of this block range that
2463 * requires the dmap control page to be adjusted.
2464 * newval - the new value of the lower level dmap or dmap control
2465 * page root.
2466 * alloc - 'true' if adjustment is due to an allocation.
2467 * level - current level of dmap control page (i.e. L0, L1, L2) to
2468 * be adjusted.
2469 *
2470 * RETURN VALUES:
2471 * 0 - success
2472 * -EIO - i/o error
2473 *
2474 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2475 */
2476 static int
dbAdjCtl(struct bmap * bmp,s64 blkno,int newval,int alloc,int level)2477 dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2478 {
2479 struct metapage *mp;
2480 s8 oldroot;
2481 int oldval;
2482 s64 lblkno;
2483 struct dmapctl *dcp;
2484 int rc, leafno, ti;
2485
2486 /* get the buffer for the dmap control page for the specified
2487 * block number and control page level.
2488 */
2489 lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
2490 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
2491 if (mp == NULL)
2492 return -EIO;
2493 dcp = (struct dmapctl *) mp->data;
2494
2495 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2496 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
2497 release_metapage(mp);
2498 return -EIO;
2499 }
2500
2501 /* determine the leaf number corresponding to the block and
2502 * the index within the dmap control tree.
2503 */
2504 leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
2505 ti = leafno + le32_to_cpu(dcp->leafidx);
2506
2507 /* save the current leaf value and the current root level (i.e.
2508 * maximum l2 free string described by this dmapctl).
2509 */
2510 oldval = dcp->stree[ti];
2511 oldroot = dcp->stree[ROOT];
2512
2513 /* check if this is a control page update for an allocation.
2514 * if so, update the leaf to reflect the new leaf value using
2515 * dbSplit(); otherwise (deallocation), use dbJoin() to update
2516 * the leaf with the new value. in addition to updating the
2517 * leaf, dbSplit() will also split the binary buddy system of
2518 * the leaves, if required, and bubble new values within the
2519 * dmapctl tree, if required. similarly, dbJoin() will join
2520 * the binary buddy system of leaves and bubble new values up
2521 * the dmapctl tree as required by the new leaf value.
2522 */
2523 if (alloc) {
2524 /* check if we are in the middle of a binary buddy
2525 * system. this happens when we are performing the
2526 * first allocation out of an allocation group that
2527 * is part (not the first part) of a larger binary
2528 * buddy system. if we are in the middle, back split
2529 * the system prior to calling dbSplit() which assumes
2530 * that it is at the front of a binary buddy system.
2531 */
2532 if (oldval == NOFREE) {
2533 rc = dbBackSplit((dmtree_t *)dcp, leafno, true);
2534 if (rc) {
2535 release_metapage(mp);
2536 return rc;
2537 }
2538 oldval = dcp->stree[ti];
2539 }
2540 dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval, true);
2541 } else {
2542 rc = dbJoin((dmtree_t *) dcp, leafno, newval, true);
2543 if (rc) {
2544 release_metapage(mp);
2545 return rc;
2546 }
2547 }
2548
2549 /* check if the root of the current dmap control page changed due
2550 * to the update and if the current dmap control page is not at
2551 * the current top level (i.e. L0, L1, L2) of the map. if so (i.e.
2552 * root changed and this is not the top level), call this routine
2553 * again (recursion) for the next higher level of the mapping to
2554 * reflect the change in root for the current dmap control page.
2555 */
2556 if (dcp->stree[ROOT] != oldroot) {
2557 /* are we below the top level of the map. if so,
2558 * bubble the root up to the next higher level.
2559 */
2560 if (level < bmp->db_maxlevel) {
2561 /* bubble up the new root of this dmap control page to
2562 * the next level.
2563 */
2564 if ((rc =
2565 dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
2566 level + 1))) {
2567 /* something went wrong in bubbling up the new
2568 * root value, so backout the changes to the
2569 * current dmap control page.
2570 */
2571 if (alloc) {
2572 dbJoin((dmtree_t *) dcp, leafno,
2573 oldval, true);
2574 } else {
2575 /* the dbJoin() above might have
2576 * caused a larger binary buddy system
2577 * to form and we may now be in the
2578 * middle of it. if this is the case,
2579 * back split the buddies.
2580 */
2581 if (dcp->stree[ti] == NOFREE)
2582 dbBackSplit((dmtree_t *)
2583 dcp, leafno, true);
2584 dbSplit((dmtree_t *) dcp, leafno,
2585 dcp->budmin, oldval, true);
2586 }
2587
2588 /* release the buffer and return the error.
2589 */
2590 release_metapage(mp);
2591 return (rc);
2592 }
2593 } else {
2594 /* we're at the top level of the map. update
2595 * the bmap control page to reflect the size
2596 * of the maximum free buddy system.
2597 */
2598 assert(level == bmp->db_maxlevel);
2599 if (bmp->db_maxfreebud != oldroot) {
2600 jfs_error(bmp->db_ipbmap->i_sb,
2601 "the maximum free buddy is not the old root\n");
2602 }
2603 bmp->db_maxfreebud = dcp->stree[ROOT];
2604 }
2605 }
2606
2607 /* write the buffer.
2608 */
2609 write_metapage(mp);
2610
2611 return (0);
2612 }
2613
2614
2615 /*
2616 * NAME: dbSplit()
2617 *
2618 * FUNCTION: update the leaf of a dmtree with a new value, splitting
2619 * the leaf from the binary buddy system of the dmtree's
2620 * leaves, as required.
2621 *
2622 * PARAMETERS:
2623 * tp - pointer to the tree containing the leaf.
2624 * leafno - the number of the leaf to be updated.
2625 * splitsz - the size the binary buddy system starting at the leaf
2626 * must be split to, specified as the log2 number of blocks.
2627 * newval - the new value for the leaf.
2628 *
2629 * RETURN VALUES: none
2630 *
2631 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2632 */
dbSplit(dmtree_t * tp,int leafno,int splitsz,int newval,bool is_ctl)2633 static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl)
2634 {
2635 int budsz;
2636 int cursz;
2637 s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2638
2639 /* check if the leaf needs to be split.
2640 */
2641 if (leaf[leafno] > tp->dmt_budmin) {
2642 /* the split occurs by cutting the buddy system in half
2643 * at the specified leaf until we reach the specified
2644 * size. pick up the starting split size (current size
2645 * - 1 in l2) and the corresponding buddy size.
2646 */
2647 cursz = leaf[leafno] - 1;
2648 budsz = BUDSIZE(cursz, tp->dmt_budmin);
2649
2650 /* split until we reach the specified size.
2651 */
2652 while (cursz >= splitsz) {
2653 /* update the buddy's leaf with its new value.
2654 */
2655 dbAdjTree(tp, leafno ^ budsz, cursz, is_ctl);
2656
2657 /* on to the next size and buddy.
2658 */
2659 cursz -= 1;
2660 budsz >>= 1;
2661 }
2662 }
2663
2664 /* adjust the dmap tree to reflect the specified leaf's new
2665 * value.
2666 */
2667 dbAdjTree(tp, leafno, newval, is_ctl);
2668 }
2669
2670
2671 /*
2672 * NAME: dbBackSplit()
2673 *
2674 * FUNCTION: back split the binary buddy system of dmtree leaves
2675 * that hold a specified leaf until the specified leaf
2676 * starts its own binary buddy system.
2677 *
2678 * the allocators typically perform allocations at the start
2679 * of binary buddy systems and dbSplit() is used to accomplish
2680 * any required splits. in some cases, however, allocation
2681 * may occur in the middle of a binary system and requires a
2682 * back split, with the split proceeding out from the middle of
2683 * the system (less efficient) rather than the start of the
2684 * system (more efficient). the cases in which a back split
2685 * is required are rare and are limited to the first allocation
2686 * within an allocation group which is a part (not first part)
2687 * of a larger binary buddy system and a few exception cases
2688 * in which a previous join operation must be backed out.
2689 *
2690 * PARAMETERS:
2691 * tp - pointer to the tree containing the leaf.
2692 * leafno - the number of the leaf to be updated.
2693 *
2694 * RETURN VALUES: none
2695 *
2696 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2697 */
dbBackSplit(dmtree_t * tp,int leafno,bool is_ctl)2698 static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl)
2699 {
2700 int budsz, bud, w, bsz, size;
2701 int cursz;
2702 s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2703
2704 /* leaf should be part (not first part) of a binary
2705 * buddy system.
2706 */
2707 assert(leaf[leafno] == NOFREE);
2708
2709 /* the back split is accomplished by iteratively finding the leaf
2710 * that starts the buddy system that contains the specified leaf and
2711 * splitting that system in two. this iteration continues until
2712 * the specified leaf becomes the start of a buddy system.
2713 *
2714 * determine maximum possible l2 size for the specified leaf.
2715 */
2716 size =
2717 LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
2718 tp->dmt_budmin);
2719
2720 /* determine the number of leaves covered by this size. this
2721 * is the buddy size that we will start with as we search for
2722 * the buddy system that contains the specified leaf.
2723 */
2724 budsz = BUDSIZE(size, tp->dmt_budmin);
2725
2726 /* back split.
2727 */
2728 while (leaf[leafno] == NOFREE) {
2729 /* find the leftmost buddy leaf.
2730 */
2731 for (w = leafno, bsz = budsz;; bsz <<= 1,
2732 w = (w < bud) ? w : bud) {
2733 if (bsz >= le32_to_cpu(tp->dmt_nleafs)) {
2734 jfs_err("JFS: block map error in dbBackSplit");
2735 return -EIO;
2736 }
2737
2738 /* determine the buddy.
2739 */
2740 bud = w ^ bsz;
2741
2742 /* check if this buddy is the start of the system.
2743 */
2744 if (leaf[bud] != NOFREE) {
2745 /* split the leaf at the start of the
2746 * system in two.
2747 */
2748 cursz = leaf[bud] - 1;
2749 dbSplit(tp, bud, cursz, cursz, is_ctl);
2750 break;
2751 }
2752 }
2753 }
2754
2755 if (leaf[leafno] != size) {
2756 jfs_err("JFS: wrong leaf value in dbBackSplit");
2757 return -EIO;
2758 }
2759 return 0;
2760 }
2761
2762
2763 /*
2764 * NAME: dbJoin()
2765 *
2766 * FUNCTION: update the leaf of a dmtree with a new value, joining
2767 * the leaf with other leaves of the dmtree into a multi-leaf
2768 * binary buddy system, as required.
2769 *
2770 * PARAMETERS:
2771 * tp - pointer to the tree containing the leaf.
2772 * leafno - the number of the leaf to be updated.
2773 * newval - the new value for the leaf.
2774 *
2775 * RETURN VALUES: none
2776 */
dbJoin(dmtree_t * tp,int leafno,int newval,bool is_ctl)2777 static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
2778 {
2779 int budsz, buddy;
2780 s8 *leaf;
2781
2782 /* can the new leaf value require a join with other leaves ?
2783 */
2784 if (newval >= tp->dmt_budmin) {
2785 /* pickup a pointer to the leaves of the tree.
2786 */
2787 leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2788
2789 /* try to join the specified leaf into a large binary
2790 * buddy system. the join proceeds by attempting to join
2791 * the specified leafno with its buddy (leaf) at new value.
2792 * if the join occurs, we attempt to join the left leaf
2793 * of the joined buddies with its buddy at new value + 1.
2794 * we continue to join until we find a buddy that cannot be
2795 * joined (does not have a value equal to the size of the
2796 * last join) or until all leaves have been joined into a
2797 * single system.
2798 *
2799 * get the buddy size (number of words covered) of
2800 * the new value.
2801 */
2802 budsz = BUDSIZE(newval, tp->dmt_budmin);
2803
2804 /* try to join.
2805 */
2806 while (budsz < le32_to_cpu(tp->dmt_nleafs)) {
2807 /* get the buddy leaf.
2808 */
2809 buddy = leafno ^ budsz;
2810
2811 /* if the leaf's new value is greater than its
2812 * buddy's value, we join no more.
2813 */
2814 if (newval > leaf[buddy])
2815 break;
2816
2817 /* It shouldn't be less */
2818 if (newval < leaf[buddy])
2819 return -EIO;
2820
2821 /* check which (leafno or buddy) is the left buddy.
2822 * the left buddy gets to claim the blocks resulting
2823 * from the join while the right gets to claim none.
2824 * the left buddy is also eligible to participate in
2825 * a join at the next higher level while the right
2826 * is not.
2827 *
2828 */
2829 if (leafno < buddy) {
2830 /* leafno is the left buddy.
2831 */
2832 dbAdjTree(tp, buddy, NOFREE, is_ctl);
2833 } else {
2834 /* buddy is the left buddy and becomes
2835 * leafno.
2836 */
2837 dbAdjTree(tp, leafno, NOFREE, is_ctl);
2838 leafno = buddy;
2839 }
2840
2841 /* on to try the next join.
2842 */
2843 newval += 1;
2844 budsz <<= 1;
2845 }
2846 }
2847
2848 /* update the leaf value.
2849 */
2850 dbAdjTree(tp, leafno, newval, is_ctl);
2851
2852 return 0;
2853 }
2854
2855
2856 /*
2857 * NAME: dbAdjTree()
2858 *
2859 * FUNCTION: update a leaf of a dmtree with a new value, adjusting
2860 * the dmtree, as required, to reflect the new leaf value.
2861 * the combination of any buddies must already be done before
2862 * this is called.
2863 *
2864 * PARAMETERS:
2865 * tp - pointer to the tree to be adjusted.
2866 * leafno - the number of the leaf to be updated.
2867 * newval - the new value for the leaf.
2868 *
2869 * RETURN VALUES: none
2870 */
dbAdjTree(dmtree_t * tp,int leafno,int newval,bool is_ctl)2871 static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl)
2872 {
2873 int lp, pp, k;
2874 int max, size;
2875
2876 size = is_ctl ? CTLTREESIZE : TREESIZE;
2877
2878 /* pick up the index of the leaf for this leafno.
2879 */
2880 lp = leafno + le32_to_cpu(tp->dmt_leafidx);
2881
2882 if (WARN_ON_ONCE(lp >= size || lp < 0))
2883 return;
2884
2885 /* is the current value the same as the old value ? if so,
2886 * there is nothing to do.
2887 */
2888 if (tp->dmt_stree[lp] == newval)
2889 return;
2890
2891 /* set the new value.
2892 */
2893 tp->dmt_stree[lp] = newval;
2894
2895 /* bubble the new value up the tree as required.
2896 */
2897 for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
2898 if (lp == 0)
2899 break;
2900
2901 /* get the index of the first leaf of the 4 leaf
2902 * group containing the specified leaf (leafno).
2903 */
2904 lp = ((lp - 1) & ~0x03) + 1;
2905
2906 /* get the index of the parent of this 4 leaf group.
2907 */
2908 pp = (lp - 1) >> 2;
2909
2910 /* determine the maximum of the 4 leaves.
2911 */
2912 max = TREEMAX(&tp->dmt_stree[lp]);
2913
2914 /* if the maximum of the 4 is the same as the
2915 * parent's value, we're done.
2916 */
2917 if (tp->dmt_stree[pp] == max)
2918 break;
2919
2920 /* parent gets new value.
2921 */
2922 tp->dmt_stree[pp] = max;
2923
2924 /* parent becomes leaf for next go-round.
2925 */
2926 lp = pp;
2927 }
2928 }
2929
2930
2931 /*
2932 * NAME: dbFindLeaf()
2933 *
2934 * FUNCTION: search a dmtree_t for sufficient free blocks, returning
2935 * the index of a leaf describing the free blocks if
2936 * sufficient free blocks are found.
2937 *
2938 * the search starts at the top of the dmtree_t tree and
2939 * proceeds down the tree to the leftmost leaf with sufficient
2940 * free space.
2941 *
2942 * PARAMETERS:
2943 * tp - pointer to the tree to be searched.
2944 * l2nb - log2 number of free blocks to search for.
2945 * leafidx - return pointer to be set to the index of the leaf
2946 * describing at least l2nb free blocks if sufficient
2947 * free blocks are found.
2948 * is_ctl - determines if the tree is of type ctl
2949 *
2950 * RETURN VALUES:
2951 * 0 - success
2952 * -ENOSPC - insufficient free blocks.
2953 */
dbFindLeaf(dmtree_t * tp,int l2nb,int * leafidx,bool is_ctl)2954 static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl)
2955 {
2956 int ti, n = 0, k, x = 0;
2957 int max_size, max_idx;
2958
2959 max_size = is_ctl ? CTLTREESIZE : TREESIZE;
2960 max_idx = is_ctl ? LPERCTL : LPERDMAP;
2961
2962 /* first check the root of the tree to see if there is
2963 * sufficient free space.
2964 */
2965 if (l2nb > tp->dmt_stree[ROOT])
2966 return -ENOSPC;
2967
2968 /* sufficient free space available. now search down the tree
2969 * starting at the next level for the leftmost leaf that
2970 * describes sufficient free space.
2971 */
2972 for (k = le32_to_cpu(tp->dmt_height), ti = 1;
2973 k > 0; k--, ti = ((ti + n) << 2) + 1) {
2974 /* search the four nodes at this level, starting from
2975 * the left.
2976 */
2977 for (x = ti, n = 0; n < 4; n++) {
2978 /* sufficient free space found. move to the next
2979 * level (or quit if this is the last level).
2980 */
2981 if (x + n > max_size)
2982 return -ENOSPC;
2983 if (l2nb <= tp->dmt_stree[x + n])
2984 break;
2985 }
2986
2987 /* better have found something since the higher
2988 * levels of the tree said it was here.
2989 */
2990 assert(n < 4);
2991 }
2992 if (le32_to_cpu(tp->dmt_leafidx) >= max_idx)
2993 return -ENOSPC;
2994
2995 /* set the return to the leftmost leaf describing sufficient
2996 * free space.
2997 */
2998 *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx);
2999
3000 return (0);
3001 }
3002
3003
3004 /*
3005 * NAME: dbFindBits()
3006 *
3007 * FUNCTION: find a specified number of binary buddy free bits within a
3008 * dmap bitmap word value.
3009 *
3010 * this routine searches the bitmap value for (1 << l2nb) free
3011 * bits at (1 << l2nb) alignments within the value.
3012 *
3013 * PARAMETERS:
3014 * word - dmap bitmap word value.
3015 * l2nb - number of free bits specified as a log2 number.
3016 *
3017 * RETURN VALUES:
3018 * starting bit number of free bits.
3019 */
dbFindBits(u32 word,int l2nb)3020 static int dbFindBits(u32 word, int l2nb)
3021 {
3022 int bitno, nb;
3023 u32 mask;
3024
3025 /* get the number of bits.
3026 */
3027 nb = 1 << l2nb;
3028 assert(nb <= DBWORD);
3029
3030 /* complement the word so we can use a mask (i.e. 0s represent
3031 * free bits) and compute the mask.
3032 */
3033 word = ~word;
3034 mask = ONES << (DBWORD - nb);
3035
3036 /* scan the word for nb free bits at nb alignments.
3037 */
3038 for (bitno = 0; mask != 0; bitno += nb, mask = (mask >> nb)) {
3039 if ((mask & word) == mask)
3040 break;
3041 }
3042
3043 ASSERT(bitno < 32);
3044
3045 /* return the bit number.
3046 */
3047 return (bitno);
3048 }
3049
3050
3051 /*
3052 * NAME: dbMaxBud(u8 *cp)
3053 *
3054 * FUNCTION: determine the largest binary buddy string of free
3055 * bits within 32-bits of the map.
3056 *
3057 * PARAMETERS:
3058 * cp - pointer to the 32-bit value.
3059 *
3060 * RETURN VALUES:
3061 * largest binary buddy of free bits within a dmap word.
3062 */
dbMaxBud(u8 * cp)3063 static int dbMaxBud(u8 * cp)
3064 {
3065 signed char tmp1, tmp2;
3066
3067 /* check if the wmap word is all free. if so, the
3068 * free buddy size is BUDMIN.
3069 */
3070 if (*((uint *) cp) == 0)
3071 return (BUDMIN);
3072
3073 /* check if the wmap word is half free. if so, the
3074 * free buddy size is BUDMIN-1.
3075 */
3076 if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0)
3077 return (BUDMIN - 1);
3078
3079 /* not all free or half free. determine the free buddy
3080 * size thru table lookup using quarters of the wmap word.
3081 */
3082 tmp1 = max(budtab[cp[2]], budtab[cp[3]]);
3083 tmp2 = max(budtab[cp[0]], budtab[cp[1]]);
3084 return (max(tmp1, tmp2));
3085 }
3086
3087
3088 /*
3089 * NAME: cnttz(uint word)
3090 *
3091 * FUNCTION: determine the number of trailing zeros within a 32-bit
3092 * value.
3093 *
3094 * PARAMETERS:
3095 * value - 32-bit value to be examined.
3096 *
3097 * RETURN VALUES:
3098 * count of trailing zeros
3099 */
cnttz(u32 word)3100 static int cnttz(u32 word)
3101 {
3102 int n;
3103
3104 for (n = 0; n < 32; n++, word >>= 1) {
3105 if (word & 0x01)
3106 break;
3107 }
3108
3109 return (n);
3110 }
3111
3112
3113 /*
3114 * NAME: cntlz(u32 value)
3115 *
3116 * FUNCTION: determine the number of leading zeros within a 32-bit
3117 * value.
3118 *
3119 * PARAMETERS:
3120 * value - 32-bit value to be examined.
3121 *
3122 * RETURN VALUES:
3123 * count of leading zeros
3124 */
cntlz(u32 value)3125 static int cntlz(u32 value)
3126 {
3127 int n;
3128
3129 for (n = 0; n < 32; n++, value <<= 1) {
3130 if (value & HIGHORDER)
3131 break;
3132 }
3133 return (n);
3134 }
3135
3136
3137 /*
3138 * NAME: blkstol2(s64 nb)
3139 *
3140 * FUNCTION: convert a block count to its log2 value. if the block
3141 * count is not a l2 multiple, it is rounded up to the next
3142 * larger l2 multiple.
3143 *
3144 * PARAMETERS:
3145 * nb - number of blocks
3146 *
3147 * RETURN VALUES:
3148 * log2 number of blocks
3149 */
blkstol2(s64 nb)3150 static int blkstol2(s64 nb)
3151 {
3152 int l2nb;
3153 s64 mask; /* meant to be signed */
3154
3155 mask = (s64) 1 << (64 - 1);
3156
3157 /* count the leading bits.
3158 */
3159 for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
3160 /* leading bit found.
3161 */
3162 if (nb & mask) {
3163 /* determine the l2 value.
3164 */
3165 l2nb = (64 - 1) - l2nb;
3166
3167 /* check if we need to round up.
3168 */
3169 if (~mask & nb)
3170 l2nb++;
3171
3172 return (l2nb);
3173 }
3174 }
3175 assert(0);
3176 return 0; /* fix compiler warning */
3177 }
3178
3179
3180 /*
3181 * NAME: dbAllocBottomUp()
3182 *
3183 * FUNCTION: alloc the specified block range from the working block
3184 * allocation map.
3185 *
3186 * the blocks will be alloc from the working map one dmap
3187 * at a time.
3188 *
3189 * PARAMETERS:
3190 * ip - pointer to in-core inode;
3191 * blkno - starting block number to be freed.
3192 * nblocks - number of blocks to be freed.
3193 *
3194 * RETURN VALUES:
3195 * 0 - success
3196 * -EIO - i/o error
3197 */
dbAllocBottomUp(struct inode * ip,s64 blkno,s64 nblocks)3198 int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
3199 {
3200 struct metapage *mp;
3201 struct dmap *dp;
3202 int nb, rc;
3203 s64 lblkno, rem;
3204 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
3205 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
3206
3207 IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
3208
3209 /* block to be allocated better be within the mapsize. */
3210 ASSERT(nblocks <= bmp->db_mapsize - blkno);
3211
3212 /*
3213 * allocate the blocks a dmap at a time.
3214 */
3215 mp = NULL;
3216 for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
3217 /* release previous dmap if any */
3218 if (mp) {
3219 write_metapage(mp);
3220 }
3221
3222 /* get the buffer for the current dmap. */
3223 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
3224 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
3225 if (mp == NULL) {
3226 IREAD_UNLOCK(ipbmap);
3227 return -EIO;
3228 }
3229 dp = (struct dmap *) mp->data;
3230
3231 /* determine the number of blocks to be allocated from
3232 * this dmap.
3233 */
3234 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
3235
3236 /* allocate the blocks. */
3237 if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
3238 release_metapage(mp);
3239 IREAD_UNLOCK(ipbmap);
3240 return (rc);
3241 }
3242 }
3243
3244 /* write the last buffer. */
3245 write_metapage(mp);
3246
3247 IREAD_UNLOCK(ipbmap);
3248
3249 return (0);
3250 }
3251
3252
dbAllocDmapBU(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)3253 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3254 int nblocks)
3255 {
3256 int rc;
3257 int dbitno, word, rembits, nb, nwords, wbitno, agno;
3258 s8 oldroot;
3259 struct dmaptree *tp = (struct dmaptree *) & dp->tree;
3260
3261 /* save the current value of the root (i.e. maximum free string)
3262 * of the dmap tree.
3263 */
3264 oldroot = tp->stree[ROOT];
3265
3266 /* determine the bit number and word within the dmap of the
3267 * starting block.
3268 */
3269 dbitno = blkno & (BPERDMAP - 1);
3270 word = dbitno >> L2DBWORD;
3271
3272 /* block range better be within the dmap */
3273 assert(dbitno + nblocks <= BPERDMAP);
3274
3275 /* allocate the bits of the dmap's words corresponding to the block
3276 * range. not all bits of the first and last words may be contained
3277 * within the block range. if this is the case, we'll work against
3278 * those words (i.e. partial first and/or last) on an individual basis
3279 * (a single pass), allocating the bits of interest by hand and
3280 * updating the leaf corresponding to the dmap word. a single pass
3281 * will be used for all dmap words fully contained within the
3282 * specified range. within this pass, the bits of all fully contained
3283 * dmap words will be marked as free in a single shot and the leaves
3284 * will be updated. a single leaf may describe the free space of
3285 * multiple dmap words, so we may update only a subset of the actual
3286 * leaves corresponding to the dmap words of the block range.
3287 */
3288 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
3289 /* determine the bit number within the word and
3290 * the number of bits within the word.
3291 */
3292 wbitno = dbitno & (DBWORD - 1);
3293 nb = min(rembits, DBWORD - wbitno);
3294
3295 /* check if only part of a word is to be allocated.
3296 */
3297 if (nb < DBWORD) {
3298 /* allocate (set to 1) the appropriate bits within
3299 * this dmap word.
3300 */
3301 dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
3302 >> wbitno);
3303
3304 word++;
3305 } else {
3306 /* one or more dmap words are fully contained
3307 * within the block range. determine how many
3308 * words and allocate (set to 1) the bits of these
3309 * words.
3310 */
3311 nwords = rembits >> L2DBWORD;
3312 memset(&dp->wmap[word], (int) ONES, nwords * 4);
3313
3314 /* determine how many bits */
3315 nb = nwords << L2DBWORD;
3316 word += nwords;
3317 }
3318 }
3319
3320 /* update the free count for this dmap */
3321 le32_add_cpu(&dp->nfree, -nblocks);
3322
3323 /* reconstruct summary tree */
3324 dbInitDmapTree(dp);
3325
3326 BMAP_LOCK(bmp);
3327
3328 /* if this allocation group is completely free,
3329 * update the highest active allocation group number
3330 * if this allocation group is the new max.
3331 */
3332 agno = blkno >> bmp->db_agl2size;
3333 if (agno > bmp->db_maxag)
3334 bmp->db_maxag = agno;
3335
3336 /* update the free count for the allocation group and map */
3337 bmp->db_agfree[agno] -= nblocks;
3338 bmp->db_nfree -= nblocks;
3339
3340 BMAP_UNLOCK(bmp);
3341
3342 /* if the root has not changed, done. */
3343 if (tp->stree[ROOT] == oldroot)
3344 return (0);
3345
3346 /* root changed. bubble the change up to the dmap control pages.
3347 * if the adjustment of the upper level control pages fails,
3348 * backout the bit allocation (thus making everything consistent).
3349 */
3350 if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
3351 dbFreeBits(bmp, dp, blkno, nblocks);
3352
3353 return (rc);
3354 }
3355
3356
3357 /*
3358 * NAME: dbExtendFS()
3359 *
3360 * FUNCTION: extend bmap from blkno for nblocks;
3361 * dbExtendFS() updates bmap ready for dbAllocBottomUp();
3362 *
3363 * L2
3364 * |
3365 * L1---------------------------------L1
3366 * | |
3367 * L0---------L0---------L0 L0---------L0---------L0
3368 * | | | | | |
3369 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm;
3370 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
3371 *
3372 * <---old---><----------------------------extend----------------------->
3373 */
dbExtendFS(struct inode * ipbmap,s64 blkno,s64 nblocks)3374 int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3375 {
3376 struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
3377 int nbperpage = sbi->nbperpage;
3378 int i, i0 = true, j, j0 = true, k, n;
3379 s64 newsize;
3380 s64 p;
3381 struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
3382 struct dmapctl *l2dcp, *l1dcp, *l0dcp;
3383 struct dmap *dp;
3384 s8 *l0leaf, *l1leaf, *l2leaf;
3385 struct bmap *bmp = sbi->bmap;
3386 int agno, l2agsize, oldl2agsize;
3387 s64 ag_rem;
3388
3389 newsize = blkno + nblocks;
3390
3391 jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
3392 (long long) blkno, (long long) nblocks, (long long) newsize);
3393
3394 /*
3395 * initialize bmap control page.
3396 *
3397 * all the data in bmap control page should exclude
3398 * the mkfs hidden dmap page.
3399 */
3400
3401 /* update mapsize */
3402 bmp->db_mapsize = newsize;
3403 bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
3404
3405 /* compute new AG size */
3406 l2agsize = dbGetL2AGSize(newsize);
3407 oldl2agsize = bmp->db_agl2size;
3408
3409 bmp->db_agl2size = l2agsize;
3410 bmp->db_agsize = (s64)1 << l2agsize;
3411
3412 /* compute new number of AG */
3413 agno = bmp->db_numag;
3414 bmp->db_numag = newsize >> l2agsize;
3415 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
3416
3417 /*
3418 * reconfigure db_agfree[]
3419 * from old AG configuration to new AG configuration;
3420 *
3421 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
3422 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
3423 * note: new AG size = old AG size * (2**x).
3424 */
3425 if (l2agsize == oldl2agsize)
3426 goto extend;
3427 k = 1 << (l2agsize - oldl2agsize);
3428 ag_rem = bmp->db_agfree[0]; /* save agfree[0] */
3429 for (i = 0, n = 0; i < agno; n++) {
3430 bmp->db_agfree[n] = 0; /* init collection point */
3431
3432 /* coalesce contiguous k AGs; */
3433 for (j = 0; j < k && i < agno; j++, i++) {
3434 /* merge AGi to AGn */
3435 bmp->db_agfree[n] += bmp->db_agfree[i];
3436 }
3437 }
3438 bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */
3439
3440 for (; n < MAXAG; n++)
3441 bmp->db_agfree[n] = 0;
3442
3443 /*
3444 * update highest active ag number
3445 */
3446
3447 bmp->db_maxag = bmp->db_maxag / k;
3448
3449 /*
3450 * extend bmap
3451 *
3452 * update bit maps and corresponding level control pages;
3453 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
3454 */
3455 extend:
3456 /* get L2 page */
3457 p = BMAPBLKNO + nbperpage; /* L2 page */
3458 l2mp = read_metapage(ipbmap, p, PSIZE, 0);
3459 if (!l2mp) {
3460 jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
3461 return -EIO;
3462 }
3463 l2dcp = (struct dmapctl *) l2mp->data;
3464
3465 /* compute start L1 */
3466 k = blkno >> L2MAXL1SIZE;
3467 l2leaf = l2dcp->stree + CTLLEAFIND + k;
3468 p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */
3469
3470 /*
3471 * extend each L1 in L2
3472 */
3473 for (; k < LPERCTL; k++, p += nbperpage) {
3474 /* get L1 page */
3475 if (j0) {
3476 /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
3477 l1mp = read_metapage(ipbmap, p, PSIZE, 0);
3478 if (l1mp == NULL)
3479 goto errout;
3480 l1dcp = (struct dmapctl *) l1mp->data;
3481
3482 /* compute start L0 */
3483 j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
3484 l1leaf = l1dcp->stree + CTLLEAFIND + j;
3485 p = BLKTOL0(blkno, sbi->l2nbperpage);
3486 j0 = false;
3487 } else {
3488 /* assign/init L1 page */
3489 l1mp = get_metapage(ipbmap, p, PSIZE, 0);
3490 if (l1mp == NULL)
3491 goto errout;
3492
3493 l1dcp = (struct dmapctl *) l1mp->data;
3494
3495 /* compute start L0 */
3496 j = 0;
3497 l1leaf = l1dcp->stree + CTLLEAFIND;
3498 p += nbperpage; /* 1st L0 of L1.k */
3499 }
3500
3501 /*
3502 * extend each L0 in L1
3503 */
3504 for (; j < LPERCTL; j++) {
3505 /* get L0 page */
3506 if (i0) {
3507 /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
3508
3509 l0mp = read_metapage(ipbmap, p, PSIZE, 0);
3510 if (l0mp == NULL)
3511 goto errout;
3512 l0dcp = (struct dmapctl *) l0mp->data;
3513
3514 /* compute start dmap */
3515 i = (blkno & (MAXL0SIZE - 1)) >>
3516 L2BPERDMAP;
3517 l0leaf = l0dcp->stree + CTLLEAFIND + i;
3518 p = BLKTODMAP(blkno,
3519 sbi->l2nbperpage);
3520 i0 = false;
3521 } else {
3522 /* assign/init L0 page */
3523 l0mp = get_metapage(ipbmap, p, PSIZE, 0);
3524 if (l0mp == NULL)
3525 goto errout;
3526
3527 l0dcp = (struct dmapctl *) l0mp->data;
3528
3529 /* compute start dmap */
3530 i = 0;
3531 l0leaf = l0dcp->stree + CTLLEAFIND;
3532 p += nbperpage; /* 1st dmap of L0.j */
3533 }
3534
3535 /*
3536 * extend each dmap in L0
3537 */
3538 for (; i < LPERCTL; i++) {
3539 /*
3540 * reconstruct the dmap page, and
3541 * initialize corresponding parent L0 leaf
3542 */
3543 if ((n = blkno & (BPERDMAP - 1))) {
3544 /* read in dmap page: */
3545 mp = read_metapage(ipbmap, p,
3546 PSIZE, 0);
3547 if (mp == NULL)
3548 goto errout;
3549 n = min(nblocks, (s64)BPERDMAP - n);
3550 } else {
3551 /* assign/init dmap page */
3552 mp = read_metapage(ipbmap, p,
3553 PSIZE, 0);
3554 if (mp == NULL)
3555 goto errout;
3556
3557 n = min_t(s64, nblocks, BPERDMAP);
3558 }
3559
3560 dp = (struct dmap *) mp->data;
3561 *l0leaf = dbInitDmap(dp, blkno, n);
3562
3563 bmp->db_nfree += n;
3564 agno = le64_to_cpu(dp->start) >> l2agsize;
3565 bmp->db_agfree[agno] += n;
3566
3567 write_metapage(mp);
3568
3569 l0leaf++;
3570 p += nbperpage;
3571
3572 blkno += n;
3573 nblocks -= n;
3574 if (nblocks == 0)
3575 break;
3576 } /* for each dmap in a L0 */
3577
3578 /*
3579 * build current L0 page from its leaves, and
3580 * initialize corresponding parent L1 leaf
3581 */
3582 *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
3583 write_metapage(l0mp);
3584 l0mp = NULL;
3585
3586 if (nblocks)
3587 l1leaf++; /* continue for next L0 */
3588 else {
3589 /* more than 1 L0 ? */
3590 if (j > 0)
3591 break; /* build L1 page */
3592 else {
3593 /* summarize in global bmap page */
3594 bmp->db_maxfreebud = *l1leaf;
3595 release_metapage(l1mp);
3596 release_metapage(l2mp);
3597 goto finalize;
3598 }
3599 }
3600 } /* for each L0 in a L1 */
3601
3602 /*
3603 * build current L1 page from its leaves, and
3604 * initialize corresponding parent L2 leaf
3605 */
3606 *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
3607 write_metapage(l1mp);
3608 l1mp = NULL;
3609
3610 if (nblocks)
3611 l2leaf++; /* continue for next L1 */
3612 else {
3613 /* more than 1 L1 ? */
3614 if (k > 0)
3615 break; /* build L2 page */
3616 else {
3617 /* summarize in global bmap page */
3618 bmp->db_maxfreebud = *l2leaf;
3619 release_metapage(l2mp);
3620 goto finalize;
3621 }
3622 }
3623 } /* for each L1 in a L2 */
3624
3625 jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
3626 errout:
3627 if (l0mp)
3628 release_metapage(l0mp);
3629 if (l1mp)
3630 release_metapage(l1mp);
3631 release_metapage(l2mp);
3632 return -EIO;
3633
3634 /*
3635 * finalize bmap control page
3636 */
3637 finalize:
3638
3639 return 0;
3640 }
3641
3642
3643 /*
3644 * dbFinalizeBmap()
3645 */
dbFinalizeBmap(struct inode * ipbmap)3646 void dbFinalizeBmap(struct inode *ipbmap)
3647 {
3648 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
3649 int actags, inactags, l2nl;
3650 s64 ag_rem, actfree, inactfree, avgfree;
3651 int i, n;
3652
3653 /*
3654 * finalize bmap control page
3655 */
3656 //finalize:
3657 /*
3658 * compute db_agpref: preferred ag to allocate from
3659 * (the leftmost ag with average free space in it);
3660 */
3661 //agpref:
3662 /* get the number of active ags and inactive ags */
3663 actags = bmp->db_maxag + 1;
3664 inactags = bmp->db_numag - actags;
3665 ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */
3666
3667 /* determine how many blocks are in the inactive allocation
3668 * groups. in doing this, we must account for the fact that
3669 * the rightmost group might be a partial group (i.e. file
3670 * system size is not a multiple of the group size).
3671 */
3672 inactfree = (inactags && ag_rem) ?
3673 (((s64)inactags - 1) << bmp->db_agl2size) + ag_rem
3674 : ((s64)inactags << bmp->db_agl2size);
3675
3676 /* determine how many free blocks are in the active
3677 * allocation groups plus the average number of free blocks
3678 * within the active ags.
3679 */
3680 actfree = bmp->db_nfree - inactfree;
3681 avgfree = (u32) actfree / (u32) actags;
3682
3683 /* if the preferred allocation group has not average free space.
3684 * re-establish the preferred group as the leftmost
3685 * group with average free space.
3686 */
3687 if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
3688 for (bmp->db_agpref = 0; bmp->db_agpref < actags;
3689 bmp->db_agpref++) {
3690 if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
3691 break;
3692 }
3693 if (bmp->db_agpref >= bmp->db_numag) {
3694 jfs_error(ipbmap->i_sb,
3695 "cannot find ag with average freespace\n");
3696 }
3697 }
3698
3699 /*
3700 * compute db_aglevel, db_agheight, db_width, db_agstart:
3701 * an ag is covered in aglevel dmapctl summary tree,
3702 * at agheight level height (from leaf) with agwidth number of nodes
3703 * each, which starts at agstart index node of the smmary tree node
3704 * array;
3705 */
3706 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3707 l2nl =
3708 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3709 bmp->db_agheight = l2nl >> 1;
3710 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
3711 for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
3712 i--) {
3713 bmp->db_agstart += n;
3714 n <<= 2;
3715 }
3716
3717 }
3718
3719
3720 /*
3721 * NAME: dbInitDmap()/ujfs_idmap_page()
3722 *
3723 * FUNCTION: initialize working/persistent bitmap of the dmap page
3724 * for the specified number of blocks:
3725 *
3726 * at entry, the bitmaps had been initialized as free (ZEROS);
3727 * The number of blocks will only account for the actually
3728 * existing blocks. Blocks which don't actually exist in
3729 * the aggregate will be marked as allocated (ONES);
3730 *
3731 * PARAMETERS:
3732 * dp - pointer to page of map
3733 * nblocks - number of blocks this page
3734 *
3735 * RETURNS: NONE
3736 */
dbInitDmap(struct dmap * dp,s64 Blkno,int nblocks)3737 static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
3738 {
3739 int blkno, w, b, r, nw, nb, i;
3740
3741 /* starting block number within the dmap */
3742 blkno = Blkno & (BPERDMAP - 1);
3743
3744 if (blkno == 0) {
3745 dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
3746 dp->start = cpu_to_le64(Blkno);
3747
3748 if (nblocks == BPERDMAP) {
3749 memset(&dp->wmap[0], 0, LPERDMAP * 4);
3750 memset(&dp->pmap[0], 0, LPERDMAP * 4);
3751 goto initTree;
3752 }
3753 } else {
3754 le32_add_cpu(&dp->nblocks, nblocks);
3755 le32_add_cpu(&dp->nfree, nblocks);
3756 }
3757
3758 /* word number containing start block number */
3759 w = blkno >> L2DBWORD;
3760
3761 /*
3762 * free the bits corresponding to the block range (ZEROS):
3763 * note: not all bits of the first and last words may be contained
3764 * within the block range.
3765 */
3766 for (r = nblocks; r > 0; r -= nb, blkno += nb) {
3767 /* number of bits preceding range to be freed in the word */
3768 b = blkno & (DBWORD - 1);
3769 /* number of bits to free in the word */
3770 nb = min(r, DBWORD - b);
3771
3772 /* is partial word to be freed ? */
3773 if (nb < DBWORD) {
3774 /* free (set to 0) from the bitmap word */
3775 dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3776 >> b));
3777 dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3778 >> b));
3779
3780 /* skip the word freed */
3781 w++;
3782 } else {
3783 /* free (set to 0) contiguous bitmap words */
3784 nw = r >> L2DBWORD;
3785 memset(&dp->wmap[w], 0, nw * 4);
3786 memset(&dp->pmap[w], 0, nw * 4);
3787
3788 /* skip the words freed */
3789 nb = nw << L2DBWORD;
3790 w += nw;
3791 }
3792 }
3793
3794 /*
3795 * mark bits following the range to be freed (non-existing
3796 * blocks) as allocated (ONES)
3797 */
3798
3799 if (blkno == BPERDMAP)
3800 goto initTree;
3801
3802 /* the first word beyond the end of existing blocks */
3803 w = blkno >> L2DBWORD;
3804
3805 /* does nblocks fall on a 32-bit boundary ? */
3806 b = blkno & (DBWORD - 1);
3807 if (b) {
3808 /* mark a partial word allocated */
3809 dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
3810 w++;
3811 }
3812
3813 /* set the rest of the words in the page to allocated (ONES) */
3814 for (i = w; i < LPERDMAP; i++)
3815 dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES);
3816
3817 /*
3818 * init tree
3819 */
3820 initTree:
3821 return (dbInitDmapTree(dp));
3822 }
3823
3824
3825 /*
3826 * NAME: dbInitDmapTree()/ujfs_complete_dmap()
3827 *
3828 * FUNCTION: initialize summary tree of the specified dmap:
3829 *
3830 * at entry, bitmap of the dmap has been initialized;
3831 *
3832 * PARAMETERS:
3833 * dp - dmap to complete
3834 * blkno - starting block number for this dmap
3835 * treemax - will be filled in with max free for this dmap
3836 *
3837 * RETURNS: max free string at the root of the tree
3838 */
dbInitDmapTree(struct dmap * dp)3839 static int dbInitDmapTree(struct dmap * dp)
3840 {
3841 struct dmaptree *tp;
3842 s8 *cp;
3843 int i;
3844
3845 /* init fixed info of tree */
3846 tp = &dp->tree;
3847 tp->nleafs = cpu_to_le32(LPERDMAP);
3848 tp->l2nleafs = cpu_to_le32(L2LPERDMAP);
3849 tp->leafidx = cpu_to_le32(LEAFIND);
3850 tp->height = cpu_to_le32(4);
3851 tp->budmin = BUDMIN;
3852
3853 /* init each leaf from corresponding wmap word:
3854 * note: leaf is set to NOFREE(-1) if all blocks of corresponding
3855 * bitmap word are allocated.
3856 */
3857 cp = tp->stree + le32_to_cpu(tp->leafidx);
3858 for (i = 0; i < LPERDMAP; i++)
3859 *cp++ = dbMaxBud((u8 *) & dp->wmap[i]);
3860
3861 /* build the dmap's binary buddy summary tree */
3862 return (dbInitTree(tp));
3863 }
3864
3865
3866 /*
3867 * NAME: dbInitTree()/ujfs_adjtree()
3868 *
3869 * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl.
3870 *
3871 * at entry, the leaves of the tree has been initialized
3872 * from corresponding bitmap word or root of summary tree
3873 * of the child control page;
3874 * configure binary buddy system at the leaf level, then
3875 * bubble up the values of the leaf nodes up the tree.
3876 *
3877 * PARAMETERS:
3878 * cp - Pointer to the root of the tree
3879 * l2leaves- Number of leaf nodes as a power of 2
3880 * l2min - Number of blocks that can be covered by a leaf
3881 * as a power of 2
3882 *
3883 * RETURNS: max free string at the root of the tree
3884 */
dbInitTree(struct dmaptree * dtp)3885 static int dbInitTree(struct dmaptree * dtp)
3886 {
3887 int l2max, l2free, bsize, nextb, i;
3888 int child, parent, nparent;
3889 s8 *tp, *cp, *cp1;
3890
3891 tp = dtp->stree;
3892
3893 /* Determine the maximum free string possible for the leaves */
3894 l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
3895
3896 /*
3897 * configure the leaf level into binary buddy system
3898 *
3899 * Try to combine buddies starting with a buddy size of 1
3900 * (i.e. two leaves). At a buddy size of 1 two buddy leaves
3901 * can be combined if both buddies have a maximum free of l2min;
3902 * the combination will result in the left-most buddy leaf having
3903 * a maximum free of l2min+1.
3904 * After processing all buddies for a given size, process buddies
3905 * at the next higher buddy size (i.e. current size * 2) and
3906 * the next maximum free (current free + 1).
3907 * This continues until the maximum possible buddy combination
3908 * yields maximum free.
3909 */
3910 for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
3911 l2free++, bsize = nextb) {
3912 /* get next buddy size == current buddy pair size */
3913 nextb = bsize << 1;
3914
3915 /* scan each adjacent buddy pair at current buddy size */
3916 for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
3917 i < le32_to_cpu(dtp->nleafs);
3918 i += nextb, cp += nextb) {
3919 /* coalesce if both adjacent buddies are max free */
3920 if (*cp == l2free && *(cp + bsize) == l2free) {
3921 *cp = l2free + 1; /* left take right */
3922 *(cp + bsize) = -1; /* right give left */
3923 }
3924 }
3925 }
3926
3927 /*
3928 * bubble summary information of leaves up the tree.
3929 *
3930 * Starting at the leaf node level, the four nodes described by
3931 * the higher level parent node are compared for a maximum free and
3932 * this maximum becomes the value of the parent node.
3933 * when all lower level nodes are processed in this fashion then
3934 * move up to the next level (parent becomes a lower level node) and
3935 * continue the process for that level.
3936 */
3937 for (child = le32_to_cpu(dtp->leafidx),
3938 nparent = le32_to_cpu(dtp->nleafs) >> 2;
3939 nparent > 0; nparent >>= 2, child = parent) {
3940 /* get index of 1st node of parent level */
3941 parent = (child - 1) >> 2;
3942
3943 /* set the value of the parent node as the maximum
3944 * of the four nodes of the current level.
3945 */
3946 for (i = 0, cp = tp + child, cp1 = tp + parent;
3947 i < nparent; i++, cp += 4, cp1++)
3948 *cp1 = TREEMAX(cp);
3949 }
3950
3951 return (*tp);
3952 }
3953
3954
3955 /*
3956 * dbInitDmapCtl()
3957 *
3958 * function: initialize dmapctl page
3959 */
dbInitDmapCtl(struct dmapctl * dcp,int level,int i)3960 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
3961 { /* start leaf index not covered by range */
3962 s8 *cp;
3963
3964 dcp->nleafs = cpu_to_le32(LPERCTL);
3965 dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
3966 dcp->leafidx = cpu_to_le32(CTLLEAFIND);
3967 dcp->height = cpu_to_le32(5);
3968 dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
3969
3970 /*
3971 * initialize the leaves of current level that were not covered
3972 * by the specified input block range (i.e. the leaves have no
3973 * low level dmapctl or dmap).
3974 */
3975 cp = &dcp->stree[CTLLEAFIND + i];
3976 for (; i < LPERCTL; i++)
3977 *cp++ = NOFREE;
3978
3979 /* build the dmap's binary buddy summary tree */
3980 return (dbInitTree((struct dmaptree *) dcp));
3981 }
3982
3983
3984 /*
3985 * NAME: dbGetL2AGSize()/ujfs_getagl2size()
3986 *
3987 * FUNCTION: Determine log2(allocation group size) from aggregate size
3988 *
3989 * PARAMETERS:
3990 * nblocks - Number of blocks in aggregate
3991 *
3992 * RETURNS: log2(allocation group size) in aggregate blocks
3993 */
dbGetL2AGSize(s64 nblocks)3994 static int dbGetL2AGSize(s64 nblocks)
3995 {
3996 s64 sz;
3997 s64 m;
3998 int l2sz;
3999
4000 if (nblocks < BPERDMAP * MAXAG)
4001 return (L2BPERDMAP);
4002
4003 /* round up aggregate size to power of 2 */
4004 m = ((u64) 1 << (64 - 1));
4005 for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
4006 if (m & nblocks)
4007 break;
4008 }
4009
4010 sz = (s64) 1 << l2sz;
4011 if (sz < nblocks)
4012 l2sz += 1;
4013
4014 /* agsize = roundupSize/max_number_of_ag */
4015 return (l2sz - L2MAXAG);
4016 }
4017
4018
4019 /*
4020 * NAME: dbMapFileSizeToMapSize()
4021 *
4022 * FUNCTION: compute number of blocks the block allocation map file
4023 * can cover from the map file size;
4024 *
4025 * RETURNS: Number of blocks which can be covered by this block map file;
4026 */
4027
4028 /*
4029 * maximum number of map pages at each level including control pages
4030 */
4031 #define MAXL0PAGES (1 + LPERCTL)
4032 #define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES)
4033
4034 /*
4035 * convert number of map pages to the zero origin top dmapctl level
4036 */
4037 #define BMAPPGTOLEV(npages) \
4038 (((npages) <= 3 + MAXL0PAGES) ? 0 : \
4039 ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
4040
dbMapFileSizeToMapSize(struct inode * ipbmap)4041 s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
4042 {
4043 struct super_block *sb = ipbmap->i_sb;
4044 s64 nblocks;
4045 s64 npages, ndmaps;
4046 int level, i;
4047 int complete, factor;
4048
4049 nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
4050 npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
4051 level = BMAPPGTOLEV(npages);
4052
4053 /* At each level, accumulate the number of dmap pages covered by
4054 * the number of full child levels below it;
4055 * repeat for the last incomplete child level.
4056 */
4057 ndmaps = 0;
4058 npages--; /* skip the first global control page */
4059 /* skip higher level control pages above top level covered by map */
4060 npages -= (2 - level);
4061 npages--; /* skip top level's control page */
4062 for (i = level; i >= 0; i--) {
4063 factor =
4064 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
4065 complete = (u32) npages / factor;
4066 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL :
4067 ((i == 1) ? LPERCTL : 1));
4068
4069 /* pages in last/incomplete child */
4070 npages = (u32) npages % factor;
4071 /* skip incomplete child's level control page */
4072 npages--;
4073 }
4074
4075 /* convert the number of dmaps into the number of blocks
4076 * which can be covered by the dmaps;
4077 */
4078 nblocks = ndmaps << L2BPERDMAP;
4079
4080 return (nblocks);
4081 }
4082