Start implementation of separate access-time file.

author NeilBrown <neilb@suse.de>

Fri, 4 Mar 2011 23:44:02 +0000 (10:44 +1100)

committer NeilBrown <neilb@suse.de>

Fri, 4 Mar 2011 23:44:02 +0000 (10:44 +1100)
author NeilBrown <neilb@suse.de>
Fri, 4 Mar 2011 23:44:02 +0000 (10:44 +1100)
committer NeilBrown <neilb@suse.de>
Fri, 4 Mar 2011 23:44:02 +0000 (10:44 +1100)
diff --git a/README b/README

index 6c3d454ebff43bc952ee8a1ed22fb8e395c0987c..21e73365e605b14deba8334e9fce1664f3a6d802 100644 (file)
--- a/README
+++ b/README
@@ -7041,3 +7041,113 @@ WritePhase - what is that all about?
     We should allow one 'page' for each metadatum, which probably meanss
     32K.
     So we should allow all state blocks to be near the start.
+
+01mar2011 - Autumn arrives.
+
+  Time to add handling of 'atime' and non-logged files.
+
+  The idea is to have a separate file for storing only 'atime'
+  This is separate from the inode file because the volatility of the data
+  is very different and one of the principles of log-structured filesystems
+  is that data of differing volatility should be kept separate.
+
+  This does mean that an inode lookup requires getting data from two files,
+  but it is hoped that the 'atime' file will mostly be in cache as each
+  block contains the atime for lots of different inodes.
+
+  The atime file contains 2 bytes for each inode, so with a block size of 4K,
+  each block would hold info for 2048 inodes.  1 million inodes would require
+  2 megabytes.
+
+  The 16bits are treated as a positive floating point number which
+  gets added to the atime stored in the inode.  The lower 5 bits are
+  the exponent, the remaining 11 bits are mantissa.  Though there is a
+  little complexity in interpreting the exponent.
+     If the exponent is 0, the mantissa is used directly as milliseconds -
+       so shift right 5 and multiply by 1000000 for nanoseconds.
+       The smallest change that can be recorded is 1 millisecond,
+       and values up to (2^11-1) milliseconds - about 2 seconds - can be stored.
+     If the exponent is 1 to 10, the mantissa has a '1' prepended as a
+       new msb, and is shifted left by (exponent-1) and then treated as milliseconds.
+       This ranges up to 2^(12+9) milliseconds or about 35 minutes, where
+       the granularity will be 2^9 millisecs or about 0.5 seconds.
+
+  
+     For exponents from 11 up to 31 we add the 1 msb and treat
+       the number as seconds after shifting (e-11).  So at e==31,
+       we shift a number that is
+       up to 4095 by 20 to get nearly 2^32 seconds or 136 years.
+       At this point the granularity is 2^20 seconds or 12 days.
+
+
+   So overall we can update the atime for 136 years without needing to
+   update the inode, and can record differences of 1msec for the first
+   couple of seconds, then gradually less granularity until we are
+   down to one second an hour after the last change, and 4 hours a
+   year later.
+
+   To convert a number of seconds to this format:
+
+   If >= 2048 seconds, we shift down until less than 4096 seconds,
+   counting the shifts.  We add 11 to that count to form the exponent,
+   shift the resulting mantissa up 5, OR in the exponent, and mask
+   out bit 16 (the implicit leading 1).
+
+   Otherwise we convert to milliseconds (divide nanoseconds by 1000000,
+   multiply seconds by 1000, and add). Then if < 2048, we shift up by
+   5 leaving a zero exponent and use that.
+
+   Otherwise we shift down until < 4096 counting shifts, add 1 to the
+   shift to form an exponent, and combine with mantissa as above.
+   
+
+   So that is the format - how do we implement it?
+
+   We don't want to expose to user-space numbers that we cannot store.
+   So any 'utimes' call updates the inode directly and clears the value
+   in the atime file.  Only updates due to accesses go to the atimes
+   file.
+   We define a 'getattr' function which looks at the atime stored in
+   the vfs inode and if it has changed we need to deal with it.
+    - if the inode is still dirty we simply update the lafs inode
+      and use the number as-is, clearing the atimes entry
+    - else we subtract the stored atime from the new atime.  If this
+      is negative or exceeds 136 years we mark the inode dirty and
+      store it there.  If we cannot mark the inode dirty for some
+      reason we just store all 1s in the atime file.
+
+    The same operation is needed when dirty_inode is called to make
+    sure atime updates get saved even when no getattr is called.
+
+    As we always need to be able to update the atime file, it needs to
+    be permanently pinned whenever an inode is read in.  For
+    non-logged files this should be cheap but we must do it anyway as
+    the file might not be non-logged.
+    So we need to keep a permanent reference to each block while the
+    inode is loaded.  That can keep it pinned.
+
+
+    We don't want updates to the atime file to be flushed in any great
+    hurry, especially if it is a logged file.  We would be quite happy
+    to only write at 'unmount' and probably 'sync'.
+    So we want to stop the pages from appearing dirty in the page
+    cache (PAGECACHE_TAG_DIRTY), and the inode from appearing dirty
+    (I_DIRTY).
+    We can still keep them dirty in lafs metadata so if release_page
+    is called we can schedule a write out then.
+    
+
+   So some steps:
+
+    1/ load atime file at mount time - there is one for each
+      filesystem.  It has inum of 3 and type of TypeAccessTime (6).
+      Also release it on unmount.
+
+    2/ loading an inode must take a ref to the block in the atime file
+      if it exists.  A new inode flag records if this has happened.
+      Unless mounted noatime, we pin the block and reserve space.
+
+    3/ getattr and dirty_inode must resolve any issues with the
+       atime.  So lafs_inode probably needs an extra field to be able
+       to check for changes
+
diff --git a/inode.c b/inode.c

index 52659c38f5147c8d6c1c9384b7f8a5bc7db9bd0e..fd8ab773db85e3d42a3f8a9204b8eee73be332cf 100644 (file)
--- a/inode.c
+++ b/inode.c
@@ -304,6 +304,7 @@ lafs_import_inode(struct inode *ino, struct datablock *b)
                         = i->quota_inodes[2] = NULL;
                 nlen = li->metadata_size - offsetof(struct la_inode,
                                                     metadata[0].fs.name);
+               i->accesstime = NULL;
                 if (i->name)
                         kfree(i->name);
                 if (nlen == 0)
diff --git a/roll.c b/roll.c

index 39cee1843304ab5afb8754586b6ed81eb0526739..3e3a6e3271256e239f0ae759d921d207adaf8ae4 100644 (file)
--- a/roll.c
+++ b/roll.c
@@ -871,6 +871,17 @@ lafs_mount(struct fs *fs)
                 fs->cleaner.seg[d].chead = p;
                 INIT_LIST_HEAD(&fs->cleaner.seg[d].cleaning);
         }
+
+       ino = lafs_iget(fs->prime_sb, 3, SYNC);
+       if (!IS_ERR(ino)) {
+               if (LAFSI(ino)->type != TypeAccessTime) {
+                       iput(ino);
+                       err = -EINVAL;
+               } else
+                       LAFSI(fs->ss[0].root)->md.fs.accesstime = ino;
+       } else if (PTR_ERR(ino) != -ENOENT)
+               err = PTR_ERR(ino);
+
  err:
         putdref(b, MKREF(mount));
         return err;
diff --git a/state.h b/state.h

index f65fb08562455106a20304f9672a556f27001896..ecbc996cdd834632a01cae0eb8aeb35777b3df07 100644 (file)
--- a/state.h
+++ b/state.h
@@ -604,6 +604,7 @@ struct lafs_inode {
                         u32     inodes_used;
                         u32     quota_inums[3];
                         struct inode *quota_inodes[3];
+                       struct inode *accesstime;
                         char    *name;
                 } fs;
                 struct inodemap_md {
diff --git a/super.c b/super.c

index f96c3463b536f2c1dfb630e9e52d5525716c4880..7429bc9177d62956721f313edaa95ab24d9da9f3 100644 (file)
--- a/super.c
+++ b/super.c
@@ -755,6 +755,12 @@ static void lafs_kill_sb(struct super_block *sb)
                    fs->scan.done == 1 &&
                    fs->cleaner.active == 0);
  
+       if (LAFSI(fs->ss[0].root)->md.fs.accesstime) {
+               struct inode *i = LAFSI(fs->ss[0].root)->md.fs.accesstime;
+               LAFSI(fs->ss[0].root)->md.fs.accesstime = NULL;
+               iput(i);
+       }
+
         kill_anon_super(fs->prime_sb);
  
         bdi_destroy(&fs->bdi);
@@ -1041,6 +1047,18 @@ struct super_block *lafs_get_subset_sb(struct inode *ino)
                                 iput(imapfile);
                 }
  
+               if (!err) {     /* attach the access-time file (inum 3), if any */
+                       struct inode *atime = lafs_iget(sb, 3, SYNC);
+                       if (!IS_ERR(atime)) {
+                               if (LAFSI(atime)->type != TypeAccessTime) {
+                                       iput(atime);    /* wrong type: drop our ref */
+                                       err = -EINVAL;
+                               } else
+                                       LAFSI(ino)->md.fs.accesstime = atime;
+                       } else if (PTR_ERR(atime) != -ENOENT)   /* a missing file is not an error */
+                               err = PTR_ERR(atime);   /* error comes from 'atime', not 'ino' */
+               }
+
                 if (!err) {
                         sb->s_op = fs->prime_sb->s_op;
                         sb->s_flags |= MS_ACTIVE;
@@ -1136,6 +1154,7 @@ lafs_get_subset(struct file_system_type *fs_type,
                         md->quota_inodes[0] = NULL;
                         md->quota_inodes[1] = NULL;
                         md->quota_inodes[2] = NULL;
+                       md->accesstime = NULL;
                         md->name = NULL;
                         lafs_dirty_dblock(inodb);
                         lafs_dirty_inode(ino);
@@ -1171,6 +1190,10 @@ out_noput:
  static void lafs_kill_subset(struct super_block *sb)
  {
         struct sb_key *k = sb->s_fs_info;
+       if (LAFSI(k->root)->md.fs.accesstime) {
+               iput(LAFSI(k->root)->md.fs.accesstime);
+               LAFSI(k->root)->md.fs.accesstime = NULL;
+       }
         kill_anon_super(sb);
         iput(k->root);
         deactivate_super(k->fs->prime_sb);
author	NeilBrown <neilb@suse.de>
	Fri, 4 Mar 2011 23:44:02 +0000 (10:44 +1100)
committer	NeilBrown <neilb@suse.de>
	Fri, 4 Mar 2011 23:44:02 +0000 (10:44 +1100)
README		patch \| blob \| history
inode.c		patch \| blob \| history
roll.c		patch \| blob \| history
state.h		patch \| blob \| history
super.c		patch \| blob \| history