4 * Copyright (C) 2005-2009
5 * Neil Brown <neilb@suse.de>
6 * Released under the GPL, version 2
8 * The stateblock and superblocks are copied into the
9 * internal 'struct fs' at startup time, and are written
10 * out based on information therein.
11 * The first stateblock read has sufficient information
12 * to size all the components. These are only changed
13 * in unusual circumstances (Adding devices, taking snapshots).
17 #define HASHSIZE (1<<HASHBITS)
18 #define HASHMASK (HASHSIZE-1)
20 /* for segment hash */
22 #define SHASHSIZE (1<<SHASHBITS)
23 #define SHASHMASK (SHASHSIZE-1)
27 #define QHASHSIZE (1<<QHASHBITS)
28 #define QHASHMASK (QHASHSIZE-1)
30 /* skip points are used to accelerate sequential-order insert
31 * in the list of blocks pending write
33 #define SKIP_MAX_HEIGHT 8
36 struct skippoint *next[SKIP_MAX_HEIGHT];
39 #define WC_NUM 3 /* 3 active write-clusters: new, clean, and defrag */
40 #define CLEANER_SEGS 4 /* Clean at most 4 segments at a time */
42 #define ACCOUNT_RESERVED 3 /* Reserve 3 segments of space for accounting and
45 #define RELEASE_RESERVED 1 /* Reserve 1 segment of space for overhead required
46 * to release space (e.g. delete file)
48 #define TOTAL_RESERVED (ACCOUNT_RESERVED + RELEASE_RESERVED)
50 struct lafs_state *state;
57 int blocksize, blocksize_bits;
60 atomic_t sb_writes_pending;
61 wait_queue_head_t sb_writes_wait;
62 wait_queue_head_t trunc_wait;
63 wait_queue_head_t async_complete;
65 struct super_block *prime_sb;
68 unsigned short nonlog_dev;
72 u64 checkpointcluster;
79 /* general purpose lock. Protects:
80 * - pending_orphans list
84 struct inode *orphans; /* the orphans file */
86 struct list_head pending_orphans;
89 int phase; /* 0 or 1 */
90 wait_queue_head_t phase_wait; /* Also use to wait for first_free_pass */
92 /* flags to set on next cluster. */
94 int rolled; /* set when rollforward has completed */
95 unsigned long fsstate;
96 #define CheckpointNeeded 0
97 #define ThreadRunning 1
98 #define ThreadNeeded 2
99 #define FinalCheckpoint 3
100 #define CleanerDisabled 4
101 #define OrphansRunning 5
102 #define CleanerBlocks 6 /* One or more threads is blocked waiting for the
103 * cleaner to progress - cleaner.need blocks are
106 #define EmergencyClean 7/* Cleaner is in emergency mode and will clean
107 * the segment with the most space no matter
108 * how young it is. The fs is considered to
109 * be full and new allocation requests get
112 #define FlushNeeded 8 /* Need to flush the current cluster because
113 * someone is waiting on writeback
115 #define SecondFlushNeeded 9 /* Need a second cluster to commit the blocks
116 * in the previous one
118 #define EmergencyPending 10 /* Cleaner isn't quite in emergency mode, but
119 * should be after the next checkpoint unless that
120 * releases lots of space
122 #define CheckpointOpen 11 /* Some data has been written since the last checkpoint,
123 * so 'next_checkpoint' is a valid timestamp
125 #define DelayYouth 12 /* While set, don't update any youth blocks. The update will
126 * happen either during seg_apply_all or in roll-forward
129 unsigned long next_checkpoint; /* Valid when CheckpointOpen is set, holds
130 * jiffie time by when we need to do a checkpoint
133 struct work_struct done_work; /* used for handling
134 * refile after write completes */
137 int active; /* number of actively cleaned segments */
138 u32 cleaning; /* amount of space that is being cleaned
141 u32 need; /* Amount of space that is needed by
142 * Some thread waiting on CleanerBlocks
145 struct mutex lock; /* protects list manipulation and refcnt of core
151 u64 haddr; /* Address of this write-cluster-header */
152 u64 seq; /* seq of most recent header loaded */
153 struct cluster_head *ch;
154 struct group_head *gh; /* current group head */
155 struct descriptor *desc;
156 struct async_complete {
157 int state; /* 0=ignore, 1=valid, 2=loading,
158 3=loaded, 4 = ioerror */
161 int have_addr; /* true if dev/seg are valid */
162 struct list_head cleaning;
166 struct task_struct *thread;
168 unsigned long newsegments; /* number of segments written since checkpoint */
169 unsigned long max_newsegs; /* max segments in a checkpoint (roughly) */
171 /* counters for (pre)allocating space. */
172 spinlock_t alloc_lock;
173 s64 free_blocks; /* initialised from free segment info */
174 u64 allocated_blocks; /* Blocks that have been (pre)allocated */
175 u64 clean_reserved; /* Blocks reserved for cleaner segments */
176 u64 max_segment; /* largest segment size */
177 u64 total_free; /* free space found in all segments this scan */
178 u64 total_free_prev; /* " " " in previous scan */
179 int cluster_updates; /* DEBUGGING */
181 #define ReleaseSpace 1
183 #define AccountSpace 3
185 struct list_head inode_index; /* list of inode index_blocks,
186 * largely so they can look hashed
189 struct list_head phase_leafs[2]; /* list of pinned blocks that
190 * have no pinned children,
194 struct list_head clean_leafs; /* list of pinned blocks that
195 * have no pinned children
196 * and are being cleaned
198 struct list_head account_leafs; /* list of accounting block
199 * that we need to write after
200 * the checkpoint is done.
201 * They are now pinned to the
205 /* Youth management */
206 int youth_next; /* number to assign to next segment */
207 int checkpoint_youth;
209 /* 10 heights: 0 to 9 */
210 #define SEG_NUM_HEIGHTS (10)
213 int size[4]; /* entry size in page as "number of skip pointers" */
215 unsigned short first, last;
217 } unused, free, cleanable, clean;
218 unsigned short head[SEG_NUM_HEIGHTS]; /* head of skiplist */
225 * NOTE: there should probably be a free list for each 'level'
230 int free_dev, free_block, free_stage;
231 int first_free_pass; /* true the first time */
232 int done, do_decay; /* cleared on each checkpoint */
233 struct datablock *youth_db, *usage0_db;
234 u16 *free_usages; /* This is an allocated page */
239 int qphase; /* like phase, but switches later in qflush */
240 struct list_head flush_list; /* list of qents that need flushing */
241 struct list_head qhash[QHASHSIZE];
243 struct backing_dev_info bdi;
245 struct block_device *bdev;
246 struct lafs_dev *devblk;
260 u32 rows_per_table, tables_per_seg;
262 int recent_dev, recent_state;
264 int tablesize; /* in segusage file, not in segments */
265 struct inode *segsum;
269 /* A 'write-cluster' descriptor
270 * Any write-cluster that we are in the process of
271 * building has this structure to describe it.
272 * We gather dirty blocks and keep track of cluster-header usage
274 struct skippoint slhead; /* skiplist of pending blocks */
275 struct list_head clhead; /* list of pending blocks (linked
280 struct page *page[4];/* a page which the cluster-head is built
281 * in - never in high-memory */
282 struct cluster_head *chead; /* the cluster head == page */
283 int chead_blocks; /* blocks allocated for cluster head */
284 int cluster_space; /* space remaining in cluster_head
285 * after current commitments
287 int chead_size; /* space used already in cluster_head
288 * new miniblock and allocation
291 int remaining; /* available space in segment */
293 unsigned long long cluster_seq; /* seq number of next cluster*/
294 u64 prev_addr; /* where previous cluster is */
296 int cnum; /* Which wc entry this is - needed for
297 * finding the 'fs' from a 'wc' */
299 /* We keep track of how many blocks are still in-flight
300 * for each cluster. This allows us to know when data is
302 * The count includes all blocks in the cluster - including
303 * those in the header - plus all those in the header of
304 * the next cluster, and possibly the one after (in the case
306 * This implies that we need to track as many as 4 separate
307 * write clusters. The one that is currently being built,
308 * and the previous 3. We use a ring buffer with a pointer
309 * to the entry that is currently being built.
311 atomic_t pending_cnt[4];
312 int pending_vfy_type[4];
313 struct list_head pending_blocks[4];
315 wait_queue_head_t pending_wait;
317 struct bio *bio; /* bio we are building */
318 u64 bio_virt; /* next sector we can add to bio */
319 int bio_head; /* True if current bio contains a header block */
320 int bio_which; /* pending[X] entry for this bio */
321 struct request_queue *bio_queue; /* queue to unplug */
327 u32 st_table, st_row;
328 u32 nxt_table, nxt_row;
334 struct hlist_head stable[SHASHSIZE];
335 spinlock_t stable_lock;
336 int stable_changed; /* Flag so we can see if the table changed while
337 * we dropped a lock */
340 struct rename_roll *next;
342 struct inode *dir, *inode;
/* Return true (non-zero) while the filesystem's phase lock is held.
 * Presumably fs->phase_locked is a hold count incremented by lockers —
 * the field itself is not among the struct fs lines visible in this
 * excerpt, so TODO confirm against the full header.
 * NOTE(review): the function's braces fall outside the sampled lines;
 * only the signature and return statement are visible here.
 */
349 static inline int test_phase_locked(struct fs *fs)
351 return fs->phase_locked > 0;
354 /* There are two sorts of blocks, data blocks and index blocks.
355 * Data blocks store the contents of files, including inode files.
356 * So each inode is in a datablock.
357 * Index blocks store index/indirect/extent blocks. As inodes often
358 * contain index information, an inode will usually have an index block as well.
360 * Data blocks are stored in an address space and are indexed by inode and offset.
361 * Index blocks are indexed by inode, depth, and offset. The "depth" is
362 * distance from data, so indirect and extent blocks have depth of 1.
363 * This index doesn't often change.
365 * All blocks have a pointer to their parent, which is an index block (except
367 * All index blocks have a linked list of children which is used to find children
368 * to move when the block is split.
369 * There are also lists of children with pending changes to incorporate.
371 * A datablock for an inode points to the struct inode.
372 * The struct inode points to the index block for the inode
373 * The index block points to the datablock through the "parent" pointer.
378 unsigned long fileaddr;
383 struct indexblock *parent;
384 struct list_head siblings; /* Next unreachable block in the same
385 * reachability-set as this block
388 struct list_head lru; /* phase_leafs, clean_leafs, account_leafs,
389 * clhead, pending_blocks */
391 struct list_head peers; /* other blocks that use the same location
392 * in storage (i.e. in another snapshot)
395 struct block *chain; /* on list of unincorporated changes, or list of blocks
399 struct ref {int cnt; char *name; } holders[16];
410 u32 orphan_slot; /* slot in orphan file to record that this
414 /* If a block is both an orphan and undergoing
415 * cleaning, it lives on the cleaning list until
416 * the cleaner has checked it. It is then moved
417 * to the pending_orphans list.
419 struct list_head orphans; /* linked list of blocks needing orphan
422 struct list_head cleaning; /* list of blocks being cleaned.
426 struct inode *my_inode; /* only valid for block holding an inode */
432 struct hlist_node hash;
435 struct list_head children;
438 * pincnt[p] is the number of pinned blocks in phase 'p' which
439 * have us as their ->parent.
443 /* Addresses that need to be incorporated in the current
444 * phase are stored in an extent table until incorporation
445 * which gets scheduled when the table is full.
446 * Index blocks are kept on a list.
447 * Blocks in the next phase are kept on a list until
448 * the phase moves past and incorporation can be considered.
451 int credits; /* Number of uninc credits still available */
452 int pending_cnt; /* number of valid entries in pending_addr */
453 /* The number of extents in pending_addr must be at most
454 * half the number of extents that can fit into an extent
455 * block. At 512 bytes, 512/12 == 42. So no more than
456 * 21. See do_incorporate_leaf
464 /* If this is internal (depth>1) the actual blocks are kept on this list */
467 /* these are all in the next phase. On phase-change,
468 * we copy all the addresses into the uninc_table
469 * and free the blocks
471 struct block *uninc_next;
474 #define iblk(__bl) container_of(__bl, struct indexblock, b)
475 #define dblk(__bl) container_of(__bl, struct datablock, b)
478 /* NOTE: 32 flags in use. Need to overlap 'data' with 'index' if
481 /* First flags that are meaningful for both Data and Index blocks
483 B_Phase1 = 0, /* phase when pinned - must be '1' - used for indexing */
484 B_Dirty, /* block has changes which haven't been committed */
485 B_Index, /* This is an index block, not a data block */
486 B_Linked, /* block is known to be linked with all peers */
488 B_Realloc, /* If set on a B_Dirty block, it was only dirtied for
489 * cleaning purposes and so should be written to the
492 B_Valid, /* block contains valid data */
493 B_Uninc, /* on ->unincorporated list for next phase, so we
494 * cannot free it just yet.*/
495 B_InoIdx, /* is the index block for an inode */
496 B_Pinned, /* B_Phase1 is meaningful and block must be written
497 * (if dirty) in this phase
499 B_Async, /* An async IO is pending on this block */
500 B_Root, /* data or index block for root. ->parent is NULL */
502 B_SegRef, /* holds a reference on segment of physaddr */
503 B_Credit, /* data block with space preallocated */
504 B_ICredit, /* index space preallocated for data block */
505 B_NCredit, /* Credit for next phase */
507 B_NICredit, /* ICredit for next phase */
508 B_UnincCredit, /* Uninc carries a credit */
509 B_IOLock, /* Block is undergoing IO */
510 B_Writeback, /* Block is in a cluster */
511 B_IOLockLock, /* lock while testing IOLock on a page */
513 B_WriteError, /* Got a write error while writing (when else?) */
514 B_PhysValid, /* ->physaddr is correct */
516 /* Flags that are only relevant for Data Block
519 B_PinPending, /* set on data blocks while checkpoint_locked if we might
520 * want to mark them dirty
522 B_Prealloc, /* This data block has physaddr==0 and has been counted
523 * in various ablock counters.
525 B_Orphan, /* ->orphan_slot is valid */
527 B_HaveLock, /* We own the page lock and when all blocks are unlocked,
528 * the page should be unlocked */
529 B_HaveWriteback,/* We own the page writeback flag and when all blocks
530 * are unlocked we should clear it. */
531 B_Claimed, /* Used for exclusive allocation of inodes */
532 B_Cleaning, /* Used by cleaner to track which blocks it has a
533 * reference on already. */
535 /* Flags that are only relevant for Index Blocks
537 B_OnFree, /* index block on the free list */
538 B_PrimaryRef, /* This indexblock has not been incorporated into the
539 * parent, and so can only be found by being a sibling
540 * of a 'primary' which is incorporated. Consequently
541 * a counted reference is held on ->siblings.prev,
542 * which is either the primary or another block holding
545 B_EmptyIndex, /* Index block is empty, has no (non-EmptyIndex) children,
546 * will be destroyed soon. Indexing must avoid it
547 * unless it is the first child of the parent.
548 * Gets allocated to '0'.
549 * Flag only set under B_IOLock.
554 /* indexing info stays in the block, not in the inode */
556 struct inode vfs_inode;
557 struct indexblock *iblock;
558 struct datablock *dblock;
559 long cblocks, /* data blocks which are committed to this file */
560 pblocks, /* extra data blocks that are committed in next phase */
561 ablocks, /* data blocks that are allocated */
563 ciblocks, /* index blocks committed */
564 piblocks; /* index blocks in next phase */
565 unsigned long iflags;
566 #define I_Phase1 1 /* set to 'this' phase when iblocks correct */
568 #define I_Deleting 3 /* preserve the inode while truncate-on-delete happens */
569 #define I_Destroyed 4 /* inode destroy has been delayed */
570 #define I_Trunc 5 /* a truncation is in process */
571 #define I_Pinned 6 /* InoIdx is pinned, i_nlink is non-zero, and consequently
572 * we own an extra ref to the inode and superblock.
574 /* next three indicate if we hold a reference on the relevant qent */
579 struct rw_semaphore ind_sem;
580 unsigned long long update_cluster;
582 /* int generation; */
589 struct list_head free_index;
591 loff_t trunc_next; /* next block to truncate from */
594 int usagetable; /* 0 and unused for subsets,
595 * 1 for main fs, >1 for snapshots
597 u64 cblocks_used; /* blocks committed */
598 u64 pblocks_used; /* extra blocks in next phase */
599 u64 ablocks_used; /* extra blocks allocated */
601 u64 blocks_unalloc; /* FIXME what is this for ?? */
606 struct inode *quota_inodes[3];
611 /* These are state use to manage inum allocation */
613 int nextbit; /* bit in nextblock-1 to check next */
616 u32 table_size; /* (blocks) */
625 /* u64 modifytime; */
627 /* u64 accesstime; */
628 struct timespec i_accesstime; /* as stored in inode */
632 /* for directories */
640 u32 gracetime; /* multiplier for graceunits */
641 u32 graceunits; /* seconds per unit */
644 /* We free inodes using rcu, by which time any
645 * metadata is irrelevant and the type is zero
/* Map a VFS 'struct inode' back to the struct lafs_inode that embeds it.
 * Standard container_of idiom: 'in' must be the 'vfs_inode' member of a
 * struct lafs_inode (declared earlier in this file), otherwise the
 * returned pointer is invalid.
 * NOTE(review): the function's braces fall outside the sampled lines;
 * only the signature and return statement are visible here.
 */
651 static inline struct lafs_inode *LAFSI(struct inode *in)
653 return container_of(in, struct lafs_inode, vfs_inode);