From 55fe6f51f41f254d3d87994d18bff04664aa013b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:24 +0700 Subject: dir.c: optionally compute sha-1 of a .gitignore file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is not used anywhere yet. But the goal is to compare quickly if a .gitignore file has changed when we have the SHA-1 of both old (cached somewhere) and new (from index or a tree) versions. Helped-by: Junio C Hamano Helped-by: Torsten Bögershausen Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'dir.h') diff --git a/dir.h b/dir.h index 6c45e9d4b..cdca71b3b 100644 --- a/dir.h +++ b/dir.h @@ -73,6 +73,12 @@ struct exclude_list_group { struct exclude_list *el; }; +struct sha1_stat { + struct stat_data stat; + unsigned char sha1[20]; + int valid; +}; + struct dir_struct { int nr, alloc; int ignored_nr, ignored_alloc; -- cgit v1.2.1 From 0dcb8d7fe0ec2687d4a6ae201ae72907d862437c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:25 +0700 Subject: untracked cache: record .gitignore information and dir hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idea is if we can capture all input and (non-rescursive) output of read_directory_recursive(), and can verify later that all the input is the same, then the second r_d_r() should produce the same output as in the first run. The requirement for this to work is stat info of a directory MUST change if an entry is added to or removed from that directory (and should not change often otherwise). If your OS and filesystem do not meet this requirement, untracked cache is not for you. Most file systems on *nix should be fine. 
On Windows, NTFS is fine while FAT may not be [1] even though FAT on Linux seems to be fine. The list of inputs of r_d_r() is in the big comment block in dir.h. In short, the output of a directory (not counting subdirs) mainly depends on stat info of the directory in question, all .gitignore files leading to it and the check_only flag when r_d_r() is called recursively. This patch records all this info (and the output) as r_d_r() runs. Two hash_sha1_file() calls are required for $GIT_DIR/info/exclude and core.excludesfile unless their stat data matches. hash_sha1_file() is only needed when .gitignore files in the worktree are modified, otherwise their SHA-1 in index is used (see the previous patch). We could store stat data for .gitignore files so we don't have to rehash them if their content is different from index, but I think .gitignore files are rarely modified, so it is not worth the extra cache data (and the hashing penalty in read-cache.c:verify_hdr(), as we will be storing this as an index extension). The implication is, if you change .gitignore, you had better add it to the index soon or you lose all the benefit of untracked cache because a modified .gitignore invalidates all subdirs recursively. This is especially bad for .gitignore at root. This cached output is about untracked files only, not ignored files, because the number of untracked files is usually small, so the cache overhead is small, while the number of ignored files could go really high (e.g. *.o files mixing with source code). 
[1] "Description of NTFS date and time stamps for files and folders" http://support.microsoft.com/kb/299648 Helped-by: Torsten Bögershausen Helped-by: David Turner Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'dir.h') diff --git a/dir.h b/dir.h index cdca71b3b..9ab74b4c1 100644 --- a/dir.h +++ b/dir.h @@ -66,6 +66,7 @@ struct exclude_stack { struct exclude_stack *prev; /* the struct exclude_stack for the parent directory */ int baselen; int exclude_ix; /* index of exclude_list within EXC_DIRS exclude_list_group */ + struct untracked_cache_dir *ucd; }; struct exclude_list_group { @@ -79,6 +80,60 @@ struct sha1_stat { int valid; }; +/* + * Untracked cache + * + * The following inputs are sufficient to determine what files in a + * directory are excluded: + * + * - The list of files and directories of the directory in question + * - The $GIT_DIR/index + * - dir_struct flags + * - The content of $GIT_DIR/info/exclude + * - The content of core.excludesfile + * - The content (or the lack) of .gitignore of all parent directories + * from $GIT_WORK_TREE + * - The check_only flag in read_directory_recursive (for + * DIR_HIDE_EMPTY_DIRECTORIES) + * + * The first input can be checked using directory mtime. In many + * filesystems, directory mtime (stat_data field) is updated when its + * files or direct subdirs are added or removed. + * + * The second one can be hooked from cache_tree_invalidate_path(). + * Whenever a file (or a submodule) is added or removed from a + * directory, we invalidate that directory. 
+ * + * The remaining inputs are easy, their SHA-1 could be used to verify + * their contents (exclude_sha1[], info_exclude_sha1[] and + * excludes_file_sha1[]) + */ +struct untracked_cache_dir { + struct untracked_cache_dir **dirs; + char **untracked; + struct stat_data stat_data; + unsigned int untracked_alloc, dirs_nr, dirs_alloc; + unsigned int untracked_nr; + unsigned int check_only : 1; + /* null SHA-1 means this directory does not have .gitignore */ + unsigned char exclude_sha1[20]; + char name[FLEX_ARRAY]; +}; + +struct untracked_cache { + struct sha1_stat ss_info_exclude; + struct sha1_stat ss_excludes_file; + const char *exclude_per_dir; + /* + * dir_struct#flags must match dir_flags or the untracked + * cache is ignored. + */ + unsigned dir_flags; + struct untracked_cache_dir *root; + /* Statistics */ + int dir_created; +}; + struct dir_struct { int nr, alloc; int ignored_nr, ignored_alloc; @@ -126,6 +181,11 @@ struct dir_struct { struct exclude_stack *exclude_stack; struct exclude *exclude; struct strbuf basebuf; + + /* Enable untracked file cache if set */ + struct untracked_cache *untracked; + struct sha1_stat ss_info_exclude; + struct sha1_stat ss_excludes_file; }; /* -- cgit v1.2.1 From ccad261f07900b55029f3fd42a9ec8f17229808f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:26 +0700 Subject: untracked cache: initial untracked cache validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure the starting conditions and all global exclude files are good to go. If not, either disable untracked cache completely, or wipe out the cache and start fresh. 
Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'dir.h') diff --git a/dir.h b/dir.h index 9ab74b4c1..1d7a9585f 100644 --- a/dir.h +++ b/dir.h @@ -115,6 +115,8 @@ struct untracked_cache_dir { unsigned int untracked_alloc, dirs_nr, dirs_alloc; unsigned int untracked_nr; unsigned int check_only : 1; + /* all data in this struct are good */ + unsigned int valid : 1; /* null SHA-1 means this directory does not have .gitignore */ unsigned char exclude_sha1[20]; char name[FLEX_ARRAY]; @@ -132,6 +134,7 @@ struct untracked_cache { struct untracked_cache_dir *root; /* Statistics */ int dir_created; + int gitignore_invalidated; }; struct dir_struct { @@ -186,6 +189,7 @@ struct dir_struct { struct untracked_cache *untracked; struct sha1_stat ss_info_exclude; struct sha1_stat ss_excludes_file; + unsigned unmanaged_exclude_files; }; /* -- cgit v1.2.1 From 91a2288b5f63fba82e912dca475154d5b9dd233a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:29 +0700 Subject: untracked cache: record/validate dir mtime and reuse cached output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main readdir loop in read_directory_recursive() is replaced with a new one that checks if cached results of a directory is still valid. If a file is added or removed from the index, the containing directory is invalidated (but not its subdirs). If directory's mtime is changed, the same happens. If a .gitignore is updated, the containing directory and all subdirs are invalidated recursively. If dir_struct#flags or other conditions change, the cache is ignored. If a directory is invalidated, we opendir/readdir/closedir and run the exclude machinery on that directory listing as usual. If untracked cache is also enabled, we'll update the cache along the way. 
If a directory is validated, we simply pull the untracked listing out from the cache. The cache also records the list of direct subdirs that we have to recurse in. Fully excluded directories are seen as "untracked files". In the best case when no dirs are invalidated, read_directory() becomes a series of stat(dir), open(.gitignore), fstat(), read(), close() and optionally hash_sha1_file(). For comparison, standard read_directory() is a sequence of opendir(), readdir(), open(.gitignore), fstat(), read(), close(), the expensive last_exclude_matching() and closedir(). We already try not to open(.gitignore) if we know it does not exist, so the open/fstat/read/close sequence does not apply to every directory. The sequence could be reduced further, as noted in prep_exclude() in another patch. So in theory, the entire best-case read_directory sequence could be reduced to a series of stat() and nothing else. This is not a silver bullet approach. When you compile a C file, for example, the old .o file is removed and a new one with the same name created, effectively invalidating the containing directory's cache (but not its subdirectories). If your build process touches every directory, this cache adds extra overhead for nothing, so it's a good idea to separate generated files from tracked files. Editors may use the same strategy for saving files. And of course you're out of luck if you run your repo on an unsupported filesystem and/or operating system. 
Helped-by: Eric Sunshine Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'dir.h') diff --git a/dir.h b/dir.h index 1d7a9585f..ff3d99bcb 100644 --- a/dir.h +++ b/dir.h @@ -135,6 +135,8 @@ struct untracked_cache { /* Statistics */ int dir_created; int gitignore_invalidated; + int dir_invalidated; + int dir_opened; }; struct dir_struct { -- cgit v1.2.1 From 26cb0182b8b2e119f469750b3511fac4624f6667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:30 +0700 Subject: untracked cache: mark what dirs should be recursed/saved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we redo this thing in a functional style, we would have one struct untracked_dir as input tree and another as output. The input is used for verification. The output is a brand new tree, reflecting current worktree. But that means recreate a lot of dir nodes even if a lot could be shared between input and output trees in good cases. So we go with the messy but efficient way, combining both input and output trees into one. We need a way to know which node in this combined tree belongs to the output. This is the purpose of this "recurse" flag. "valid" bit can't be used for this because it's about data of the node except the subdirs. When we invalidate a directory, we want to keep cached data of the subdirs intact even though we don't really know what subdir still exists (yet). Then we check worktree to see what actual subdir remains on disk. Those will have 'recurse' bit set again. If cached data for those are still valid, we may be able to avoid computing exclude files for them. Those subdirs that are deleted will have 'recurse' remained clear and their 'valid' bits do not matter. 
Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'dir.h') diff --git a/dir.h b/dir.h index ff3d99bcb..95baf014c 100644 --- a/dir.h +++ b/dir.h @@ -115,8 +115,9 @@ struct untracked_cache_dir { unsigned int untracked_alloc, dirs_nr, dirs_alloc; unsigned int untracked_nr; unsigned int check_only : 1; - /* all data in this struct are good */ + /* all data except 'dirs' in this struct are good */ unsigned int valid : 1; + unsigned int recurse : 1; /* null SHA-1 means this directory does not have .gitignore */ unsigned char exclude_sha1[20]; char name[FLEX_ARRAY]; -- cgit v1.2.1 From 83c094ad0dd2104adbbec034f802dceb1d052981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:33 +0700 Subject: untracked cache: save to an index extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Helped-by: Stefan Beller Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 1 + 1 file changed, 1 insertion(+) (limited to 'dir.h') diff --git a/dir.h b/dir.h index 95baf014c..dc3ee0b2e 100644 --- a/dir.h +++ b/dir.h @@ -298,4 +298,5 @@ static inline int dir_path_match(const struct dir_entry *ent, has_trailing_dir); } +void write_untracked_extension(struct strbuf *out, struct untracked_cache *untracked); #endif -- cgit v1.2.1 From f9e6c649589e0940ccb82821107fb658277ed86b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:34 +0700 Subject: untracked cache: load from UNTR index extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'dir.h') diff --git a/dir.h b/dir.h index dc3ee0b2e..40a679a80 100644 --- a/dir.h +++ b/dir.h @@ -298,5 
+298,7 @@ static inline int dir_path_match(const struct dir_entry *ent, has_trailing_dir); } +void free_untracked_cache(struct untracked_cache *); +struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz); void write_untracked_extension(struct strbuf *out, struct untracked_cache *untracked); #endif -- cgit v1.2.1 From e931371a8f1164185486a1f5fdaaa708b4a6217c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:35 +0700 Subject: untracked cache: invalidate at index addition or removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ideally we should implement untracked_cache_remove_from_index() and untracked_cache_add_to_index() so that they update untracked cache right away instead of invalidating it and wait for read_directory() next time to deal with it. But that may need some more work in unpack-trees.c. So stay simple as the first step. The new call in add_index_entry_with_check() may look strange because new calls usually stay close to cache_tree_invalidate_path(). We do it a bit later than c_t_i_p() in this function because if it's about replacing the entry with the same name, we don't care (but cache-tree does). 
Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'dir.h') diff --git a/dir.h b/dir.h index 40a679a80..2ce7dd3d2 100644 --- a/dir.h +++ b/dir.h @@ -298,6 +298,10 @@ static inline int dir_path_match(const struct dir_entry *ent, has_trailing_dir); } +void untracked_cache_invalidate_path(struct index_state *, const char *); +void untracked_cache_remove_from_index(struct index_state *, const char *); +void untracked_cache_add_to_index(struct index_state *, const char *); + void free_untracked_cache(struct untracked_cache *); struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz); void write_untracked_extension(struct strbuf *out, struct untracked_cache *untracked); -- cgit v1.2.1 From 1e8fef609e78110e276df633c5ba1fb1f1589fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 8 Mar 2015 17:12:46 +0700 Subject: untracked cache: guard and disable on system changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the user enables untracked cache, then - move worktree to an unsupported filesystem - or simply upgrade OS - or move the whole (portable) disk from one machine to another - or access a shared fs from another machine there's no guarantee that untracked cache can still function properly. Record the worktree location and OS footprint in the cache. If it changes, err on the safe side and disable the cache. The user can 'update-index --untracked-cache' again to make sure all conditions are met. This adds a new requirement that setup_git_directory* must be called before read_cache() because we need worktree location by then, or the cache is dropped. This change does not cover all bases, you can fool it if you try hard. The point is to stop accidents. Helped-by: Eric Sunshine Helped-by: brian m. 
carlson Helped-by: Torsten Bögershausen Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- dir.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'dir.h') diff --git a/dir.h b/dir.h index 2ce7dd3d2..6ccbc454a 100644 --- a/dir.h +++ b/dir.h @@ -127,6 +127,7 @@ struct untracked_cache { struct sha1_stat ss_info_exclude; struct sha1_stat ss_excludes_file; const char *exclude_per_dir; + struct strbuf ident; /* * dir_struct#flags must match dir_flags or the untracked * cache is ignored. @@ -305,4 +306,5 @@ void untracked_cache_add_to_index(struct index_state *, const char *); void free_untracked_cache(struct untracked_cache *); struct untracked_cache *read_untracked_extension(const void *data, unsigned long sz); void write_untracked_extension(struct strbuf *out, struct untracked_cache *untracked); +void add_untracked_ident(struct untracked_cache *); #endif -- cgit v1.2.1