4 * Copyright (C) 2005-2009
5 * Neil Brown <neilb@suse.de>
6 * Released under the GPL, version 2
8 * The stateblock and superblocks are copied into the
9 * internal 'struct fs' at startup time, and are written
10 * out based on information there-in.
11 * The first stateblock read has sufficient information
12 * to size all the components. These are only changed
13 * in unusual circumstances (Adding devices, taking snapshots).
17 #define HASHSIZE (1<<HASHBITS)
18 #define HASHMASK (HASHSIZE-1)
20 /* for segment hash */
22 #define SHASHSIZE (1<<SHASHBITS)
23 #define SHASHMASK (SHASHSIZE-1)
27 #define QHASHSIZE (1<<QHASHBITS)
28 #define QHASHMASK (QHASHSIZE-1)
30 /* skip points are used to accelerate sequential-order insert
31 * in the list of blocks pending write
33 #define SKIP_MAX_HEIGHT 8
36 struct skippoint *next[SKIP_MAX_HEIGHT];
39 #define WC_NUM 3 /* 3 active write-clusters: new, clean, and defrag */
40 #define CLEANER_SEGS 4 /* Clean at most 4 segments at a time */
42 #define ACCOUNT_RESERVED 3 /* Reserve 3 segments of space for accounting and
45 #define RELEASE_RESERVED 1 /* Reserve 1 segment of space for overhead required
46 * to release space (e.g. delete file)
48 #define TOTAL_RESERVED (ACCOUNT_RESERVED + RELEASE_RESERVED)
50 struct lafs_state *state;
57 int blocksize, blocksize_bits;
60 atomic_t sb_writes_pending;
61 wait_queue_head_t sb_writes_wait;
62 wait_queue_head_t trunc_wait;
63 wait_queue_head_t async_complete;
65 struct super_block *prime_sb;
68 unsigned short nonlog_dev;
72 u64 checkpointcluster;
79 /* general purpose lock. Protects:
80 * - pending_orphans list
84 struct inode *orphans; /* the orphans file */
86 struct list_head pending_orphans;
89 int phase; /* 0 or 1 */
90 wait_queue_head_t phase_wait; /* Also use to wait for first_free_pass */
92 /* flags to set on next cluster. */
94 int rolled; /* set when rollforward has completed */
95 unsigned long fsstate;
96 #define CheckpointNeeded 0
97 #define ThreadRunning 1
98 #define ThreadNeeded 2
99 #define FinalCheckpoint 3
100 #define CleanerDisabled 4
101 #define OrphansRunning 5
102 #define CleanerBlocks 6 /* One or more threads is blocked waiting for the
103 * cleaner to progress - cleaner.need blocks are
106 #define EmergencyClean 7/* Cleaner is in emergency mode and will clean
107 * the segment with the most space no matter
108 * how young it is. The fs is considered to
109 * be full and new allocation requests get
112 #define FlushNeeded 8 /* Need to flush the current cluster because
113 * someone is waiting on writeback
115 #define SecondFlushNeeded 9 /* Need a second cluster to commit the blocks
116 * in the previous one
118 #define EmergencyPending 10 /* Cleaner isn't quite in emergency mode, but
119 * should be after the next checkpoint unless that
120 * releases lots of space
122 #define CheckpointOpen 11 /* Some data has been written since the last checkpoint,
123 * so 'next_checkpoint' is a valid timestamp
125 #define DelayYouth 12 /* While set, don't update any youth blocks. The update will
126 * happen either during seg_apply_all or in roll-forward
129 unsigned long next_checkpoint; /* Valid when CheckpointOpen is set, holds
130 * jiffie time by when we need to do a checkpoint
133 struct work_struct done_work; /* used for handling
134 * refile after write completes */
137 int active; /* number of actively cleaned segments */
138 u32 cleaning; /* amount of space that is being cleaned
141 u32 need; /* Amount of space that is needed by
142 * Some thread waiting on CleanerBlocks
145 struct mutex lock; /* protects list mani and refcnt of core
151 u64 haddr; /* Address of this write-cluster-header */
152 u64 seq; /* seq of most recent header loaded */
153 struct cluster_head *ch;
154 struct group_head *gh; /* current group head */
155 struct descriptor *desc;
156 struct async_complete {
157 int state; /* 0=ignore, 1=valid, 2=loading,
158 3=loaded, 4 = ioerror */
161 int have_addr; /* true if dev/seg are valid */
162 struct list_head cleaning;
166 struct task_struct *thread;
168 unsigned long newsegments; /* number of segments written since checkpoint */
169 unsigned long max_newsegs; /* max segments in a checkpoint (roughly) */
171 /* counters for (pre)allocating space. */
172 spinlock_t alloc_lock;
173 s64 free_blocks; /* initialised from free segment info */
174 u64 allocated_blocks; /* Blocks that have been (pre)allocated */
175 u64 clean_reserved; /* Blocks reserved for cleaner segments */
176 u64 max_segment; /* largest segment size */
177 u64 total_free; /* free space found in all segments this scan */
178 u64 total_free_prev; /* " " " in previous scan */
179 int cluster_updates; /* DEBUGGING */
181 #define ReleaseSpace 1
183 #define AccountSpace 3
185 struct list_head inode_index; /* list of inode index_blocks,
186 * largely so they can look hashed
189 struct list_head phase_leafs[2]; /* list of pinned blocks that
190 * have no pinned children,
194 struct list_head clean_leafs; /* list of pinned blocks that
195 * have no pinned children
196 * and are being cleaned
198 struct list_head account_leafs; /* list of accounting block
199 * that we need to write after
200 * the checkpoint is done.
201 * They are now pinned to the
205 /* Youth management */
206 int youth_next; /* number to assign to next segment */
207 int checkpoint_youth;
209 /* 10 heights: 0 to 9 */
210 #define SEG_NUM_HEIGHTS (10)
213 int size[4]; /* entry size in page as "number of skip pointers" */
215 unsigned short first, last;
217 } unused, free, cleanable, clean;
218 unsigned short head[SEG_NUM_HEIGHTS]; /* head of skiplist */
225 * NOTE: there should probably be a free list for each 'level'
230 int free_dev, free_block, free_stage;
231 int first_free_pass; /* true the first time */
232 int done, do_decay; /* cleared on each checkpoint */
233 struct datablock *youth_db, *usage0_db;
234 u32 *free_usages; /* This is an allocated page */
239 int qphase; /* like phase, but switches later in qflush */
240 struct list_head flush_list; /* list of qents that need flushing */
241 struct list_head qhash[QHASHSIZE];
243 struct backing_dev_info bdi;
245 struct block_device *bdev;
246 struct lafs_dev *devblk;
259 u32 rows_per_table, tables_per_seg;
261 int recent_dev, recent_state;
263 int tablesize; /* in segusage file, not in segments */
264 struct inode *segsum;
268 /* A 'write-cluster' descriptor
269 * Any write-cluster that we are in the process of
270 * building has this structure to describe it.
271 * We gather dirty blocks and keep track of cluster-header usage
273 struct skippoint slhead; /* skiplist of pending blocks */
274 struct list_head clhead; /* list of pending blocks (linked
279 struct page *page[4];/* a page which the cluster-head is built
280 * in - never in high-memory */
281 struct cluster_head *chead; /* the cluster head == page */
282 int chead_blocks; /* blocks allocated for cluster head */
283 int cluster_space; /* space remaining in cluster_head
284 * after current commitments
286 int chead_size; /* space used already in cluster_head
287 * new miniblock and allocation
290 int remaining; /* available space in segment */
292 unsigned long long cluster_seq; /* seq number of next cluster*/
293 u64 prev_addr; /* where previous cluster is */
295 int cnum; /* Which wc entry this is - needed for
296 * finding the 'fs' from a 'wc' */
298 /* We keep track of how many blocks are still in-flight
299 * for each cluster. This allows us to know when data is
301 * The count included all blocks in the cluster - including
302 * those in the header - plus all those in the header of
303 * the next cluster, and possibly the one after (in the case
305 * This implies that we need to track as many as 4 separate
306 * write cluster. The one that is currently being built,
307 * and the previous 3. We use a ring buffer with a pointer
308 * to the entry what is currently being built.
310 atomic_t pending_cnt[4];
311 int pending_vfy_type[4];
312 struct list_head pending_blocks[4];
314 wait_queue_head_t pending_wait;
316 struct bio *bio; /* bio we are building */
317 u64 bio_virt; /* next sector we can add to bio */
318 int bio_head; /* True if current bio contains a header block */
319 int bio_which; /* pending[X] entry for this bio */
320 struct request_queue *bio_queue; /* queue to unplug */
326 u32 st_table, st_row;
327 u32 nxt_table, nxt_row;
333 struct hlist_head stable[SHASHSIZE];
334 spinlock_t stable_lock;
335 int stable_changed; /* Flag so we can see if the table changed while
336 * we dropped a lock */
339 struct rename_roll *next;
341 struct inode *dir, *inode;
348 static inline int test_phase_locked(struct fs *fs)
350 return fs->phase_locked > 0;
353 /* There are two sorts of blocks, data blocks and index blocks.
354 * Data blocks store the contents of files, including inode files.
355 * So each inode is in a datablock.
356 * Index blocks store index/indirect/extent blocks. As inodes often
357 * contain index information, Inode will usually have an index block aswell.
359 * Data blocks are stored in an address space and are indexed by inode and offset.
360 * Index blocks are indexed by inode, depth, and offset. The "depth" is
361 * distance from data, so indirect and extent blocks have depth of 1.
362 * This index doesn't often change.
364 * All blocks have a pointer to their parent, which is an index block (except
366 * All index blocks have a linked list of children which is used to find children
367 * to move when the block is split.
368 * There are also lists of children with pending changes to incorporate.
370 * A datablock for an inode points to the struct inode.
371 * The struct inode points to the index block for the inode
372 * The index block points to the datablock through the "parent" pointer.
377 unsigned long fileaddr;
382 struct indexblock *parent;
383 struct list_head siblings; /* Next unreachable block in the same
384 * reachability-set as this block
387 struct list_head lru; /* phase_leafs, clean_leafs, account_leafs,
388 * clhead, pending_blocks */
390 struct list_head peers; /* other blocks that use the same location
391 * in storage (i.e. in another snapshot)
394 struct block *chain; /* on list of unincorporated changes, or list of blocks
398 struct ref {int cnt; char *name; } holders[16];
409 u32 orphan_slot; /* slot in orphan file to record that this
413 /* If a block is both an orphan and undergoing
414 * cleaning, it lives on the cleaning list until
415 * the cleaner has checked it. It is then moved
416 * to the pending_orphans list.
418 struct list_head orphans; /* linked list of blocks needing orphan
421 struct list_head cleaning; /* list of blocks being cleaned.
425 struct inode *my_inode; /* only valid for block holding an inode */
431 struct hlist_node hash;
434 struct list_head children;
437 * pincnt[p] is the number of pinned blocks in phase 'p' which
438 * have us as their ->parent.
442 /* Addresses that need to be incorporated in the current
443 * phase are stored in an extent table until incorporation
444 * which gets scheduled when the table is full.
445 * Index block are kept on an list.
446 * Blocks in the next phase are kept on a list until
447 * the phase moves past and incorporation can be considered.
450 int credits; /* Number of uninc credits still available */
451 int pending_cnt; /* number of valid entries in pending_addr */
452 /* The number of extents in pending_addr must be at most
453 * half the number of extents that can fit into an extent
454 * block. At 512 bytes, 512/12 == 42. So now more than
455 * 21. See do_incorporate_leaf
463 /* If this is internal (depth>1) the actual block are kept on this list*/
466 /* these are all in the next phase. On phase-change,
467 * we copy all the addresses into the uninc_table
468 * and free the blocks
470 struct block *uninc_next;
473 #define iblk(__bl) container_of(__bl, struct indexblock, b)
474 #define dblk(__bl) container_of(__bl, struct datablock, b)
477 /* NOTE: 32 flags in used. Need to overlap 'data' with 'index' if
480 /* First flags that are meaningful for both Data and Index blocks
482 B_Phase1 = 0, /* phase when pinned - must be '1' - used for indexing */
483 B_Dirty, /* block has changes which haven't been committed */
484 B_Index, /* This is an index block, not a data block */
485 B_Linked, /* block is known to be linked with all peers */
487 B_Realloc, /* If set on a B_Dirty block, it was only dirtied for
488 * cleaning purposes and so should be written to the
491 B_Valid, /* block contains valid data */
492 B_Uninc, /* on ->unincorporated list for next phase, so we
493 * cannot free it just yet.*/
494 B_InoIdx, /* is the index block for an inode */
495 B_Pinned, /* B_Phase1 is meaningful and block must be written
496 * (if dirty) in this phase
498 B_Async, /* An async IO is pending on this block */
499 B_Root, /* data or index block for root. ->parent is NULL */
501 B_SegRef, /* holds a reference on segment of physaddr */
502 B_Credit, /* data block with space preallocated */
503 B_ICredit, /* index space preallocated for data block */
504 B_NCredit, /* Credit for next phase */
506 B_NICredit, /* ICredit for next phase */
507 B_UnincCredit, /* Uninc carries a credit */
508 B_IOLock, /* Block is undergoing IO */
509 B_Writeback, /* Block is in a cluster */
510 B_IOLockLock, /* lock while testing IOLock on a page */
512 B_WriteError, /* Got a write error while writing (when else?) */
513 B_PhysValid, /* ->physaddr is correct */
515 /* Flags that are only relevant for Data Block
518 B_PinPending, /* set on data blocks while checkpoint_locked if we might
519 * want to mark them dirty
521 B_Prealloc, /* This data block has physaddr==0 and has been counted
522 * in various ablock counters.
524 B_Orphan, /* ->orphan_slot is valid */
526 B_HaveLock, /* We own the page lock and when all blocks are unlocked,
527 * the page should be unlocked */
528 B_HaveWriteback,/* We own the page writeback flag and when all blocks
529 * are unlocked we should clear it. */
530 B_Claimed, /* Used for exclusive allocation of inodes */
531 B_Cleaning, /* Used by cleaner to track which blocks it has a
532 * reference on already. */
534 /* Flags that are only relevant for Index Blocks
536 B_OnFree, /* index block on the free list */
537 B_PrimaryRef, /* This indexblock has not been incorporated into the
538 * parent, and so can only be found by being a sibling
539 * of a 'primary' which is incorporated. Consequently
540 * a counted reference is held on ->siblings.prev,
541 * which is either the primary or another block holding
544 B_EmptyIndex, /* Index block is empty, has no (non-EmptyIndex) children,
545 * will be destroyed soon. Indexing must avoid it
546 * unless if the first child of parent.
547 * Gets allocated to '0'.
548 * Flag only set under B_IOLock.
553 /* indexing info stays in the block, not in the inode */
555 struct inode vfs_inode;
556 struct inode *filesys; /* Inode of containing TypeInodeFile */
557 struct indexblock *iblock;
558 struct datablock *dblock;
559 long cblocks, /* data blocks which are commited to this file */
560 pblocks, /* extra data blocks that are commited in next phase */
561 ablocks, /* data blocks that are allocated */
563 ciblocks, /* index blocks commited */
564 piblocks; /* index blocks in next phase */
565 unsigned long iflags;
566 #define I_Phase1 1 /* set to 'this' phase when iblocks correct */
568 #define I_Deleting 3 /* preserve the inode while truncate-on-delete happens */
569 #define I_Destroyed 4 /* inode destroy has been delayed */
570 #define I_Trunc 5 /* a truncation is in process */
571 #define I_Pinned 6 /* InoIdx is pinned, i_nlink is non-zero, and consequently
572 * we own an extra ref to the inode and superblock.
574 #define I_AccessTime 7 /* Set if this inode holds a reference to the block
575 * in the accesstime file that holds our atime
577 /* next three indicate if we hold a reference on the relevant qent */
582 struct rw_semaphore ind_sem;
583 unsigned long long update_cluster;
585 /* int generation; */
592 struct list_head free_index;
594 loff_t trunc_next; /* next block to truncate from */
597 int usagetable; /* 0 and unused for subsets,
598 * 1 for main fs, >1 for snapshots
600 u64 cblocks_used; /* blocks commited */
601 u64 pblocks_used; /* extra blocks in next phase */
602 u64 ablocks_used; /* extra blocks allocated */
604 u64 blocks_unalloc; /* FIXME what is this for ?? */
607 u32 parent; /* like 'parent' in directory */
610 struct inode *quota_inodes[3];
611 struct inode *accesstime;
616 /* These are state use to manage inum allocation */
618 int nextbit; /* bit in nextblock-1 to check next */
621 u32 table_size; /* (blocks) */
624 u16 atime_offset; /* value stored in atime file */
631 /* u64 modifytime; */
633 /* u64 accesstime; */
634 struct timespec i_accesstime; /* as stored in inode */
638 /* for directories */
646 u32 gracetime; /* multiplier for graceunits */
647 u32 graceunits; /* seconds per unit */
650 /* We free inodes using rcu, by which time any
651 * metadata is irrelevant and the type is zero
657 static inline struct lafs_inode *LAFSI(struct inode *in)
659 return container_of(in, struct lafs_inode, vfs_inode);