-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[libcxx] Cache file attributes during directory iteration. #93316
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write If you have received no comments on your PR for a week, you can request a review If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums. |
@llvm/pr-subscribers-libcxx Author: Eduard Satdarov (ed-sat) ChangesAdded caching file attributes during directory iteration for OS Windows. Allows improving performance working with files in directory. Full diff: https://github.com/llvm/llvm-project/pull/93316.diff 3 Files Affected:
diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h
index 016ad94a853dc..6d35d92ebf483 100644
--- a/libcxx/include/__filesystem/directory_entry.h
+++ b/libcxx/include/__filesystem/directory_entry.h
@@ -200,6 +200,8 @@ class directory_entry {
_Empty,
_IterSymlink,
_IterNonSymlink,
+ _IterCachedSymlink,
+ _IterCachedNonSymlink,
_RefreshSymlink,
_RefreshSymlinkUnresolved,
_RefreshNonSymlink
@@ -241,6 +243,28 @@ class directory_entry {
return __data;
}
+ _LIBCPP_HIDE_FROM_ABI static __cached_data __create_iter_cached_result(file_type __ft, uintmax_t __size, perms __perm, file_time_type __write_time) {
+ __cached_data __data;
+ __data.__type_ = __ft;
+ __data.__size_ = __size;
+ __data.__write_time_ = __write_time;
+ if (__ft == file_type::symlink)
+ __data.__sym_perms_ = __perm;
+ else
+ __data.__non_sym_perms_ = __perm;
+ __data.__cache_type_ = [&]() {
+ switch (__ft) {
+ case file_type::none:
+ return _Empty;
+ case file_type::symlink:
+ return _IterCachedSymlink;
+ default:
+ return _IterCachedNonSymlink;
+ }
+ }();
+ return __data;
+ }
+
_LIBCPP_HIDE_FROM_ABI void __assign_iter_entry(_Path&& __p, __cached_data __dt) {
__p_ = std::move(__p);
__data_ = __dt;
@@ -282,12 +306,14 @@ class directory_entry {
case _Empty:
return __symlink_status(__p_, __ec).type();
case _IterSymlink:
+ case _IterCachedSymlink:
case _RefreshSymlink:
case _RefreshSymlinkUnresolved:
if (__ec)
__ec->clear();
return file_type::symlink;
case _IterNonSymlink:
+ case _IterCachedNonSymlink:
case _RefreshNonSymlink:
file_status __st(__data_.__type_);
if (__ec && !filesystem::exists(__st))
@@ -303,9 +329,11 @@ class directory_entry {
switch (__data_.__cache_type_) {
case _Empty:
case _IterSymlink:
+ case _IterCachedSymlink:
case _RefreshSymlinkUnresolved:
return __status(__p_, __ec).type();
case _IterNonSymlink:
+ case _IterCachedNonSymlink:
case _RefreshNonSymlink:
case _RefreshSymlink: {
file_status __st(__data_.__type_);
@@ -324,8 +352,10 @@ class directory_entry {
case _Empty:
case _IterNonSymlink:
case _IterSymlink:
+ case _IterCachedSymlink:
case _RefreshSymlinkUnresolved:
return __status(__p_, __ec);
+ case _IterCachedNonSymlink:
case _RefreshNonSymlink:
case _RefreshSymlink:
return file_status(__get_ft(__ec), __data_.__non_sym_perms_);
@@ -339,8 +369,10 @@ class directory_entry {
case _IterNonSymlink:
case _IterSymlink:
return __symlink_status(__p_, __ec);
+ case _IterCachedNonSymlink:
case _RefreshNonSymlink:
return file_status(__get_sym_ft(__ec), __data_.__non_sym_perms_);
+ case _IterCachedSymlink:
case _RefreshSymlink:
case _RefreshSymlinkUnresolved:
return file_status(__get_sym_ft(__ec), __data_.__sym_perms_);
@@ -353,8 +385,10 @@ class directory_entry {
case _Empty:
case _IterNonSymlink:
case _IterSymlink:
+ case _IterCachedSymlink:
case _RefreshSymlinkUnresolved:
return filesystem::__file_size(__p_, __ec);
+ case _IterCachedNonSymlink:
case _RefreshSymlink:
case _RefreshNonSymlink: {
error_code __m_ec;
@@ -375,6 +409,8 @@ class directory_entry {
case _Empty:
case _IterNonSymlink:
case _IterSymlink:
+ case _IterCachedNonSymlink:
+ case _IterCachedSymlink:
case _RefreshSymlinkUnresolved:
return filesystem::__hard_link_count(__p_, __ec);
case _RefreshSymlink:
@@ -395,6 +431,8 @@ class directory_entry {
case _IterSymlink:
case _RefreshSymlinkUnresolved:
return filesystem::__last_write_time(__p_, __ec);
+ case _IterCachedNonSymlink:
+ case _IterCachedSymlink:
case _RefreshSymlink:
case _RefreshNonSymlink: {
error_code __m_ec;
diff --git a/libcxx/src/filesystem/directory_iterator.cpp b/libcxx/src/filesystem/directory_iterator.cpp
index dceb3486279f8..d7ed9a358f559 100644
--- a/libcxx/src/filesystem/directory_iterator.cpp
+++ b/libcxx/src/filesystem/directory_iterator.cpp
@@ -77,13 +77,13 @@ class __dir_stream {
bool assign() {
if (!wcscmp(__data_.cFileName, L".") || !wcscmp(__data_.cFileName, L".."))
return false;
- // FIXME: Cache more of this
- // directory_entry::__cached_data cdata;
- // cdata.__type_ = get_file_type(__data_);
- // cdata.__size_ = get_file_size(__data_);
- // cdata.__write_time_ = get_write_time(__data_);
__entry_.__assign_iter_entry(
- __root_ / __data_.cFileName, directory_entry::__create_iter_result(detail::get_file_type(__data_)));
+ __root_ / __data_.cFileName,
+ directory_entry::__create_iter_cached_result(
+ detail::get_file_type(__data_),
+ detail::get_file_size(__data_),
+ detail::get_file_perm(__data_),
+ detail::get_write_time(__data_)));
return true;
}
diff --git a/libcxx/src/filesystem/file_descriptor.h b/libcxx/src/filesystem/file_descriptor.h
index 50178ff84e03f..44a072b72e766 100644
--- a/libcxx/src/filesystem/file_descriptor.h
+++ b/libcxx/src/filesystem/file_descriptor.h
@@ -97,11 +97,17 @@ inline uintmax_t get_file_size(const WIN32_FIND_DATAW& data) {
return (static_cast<uint64_t>(data.nFileSizeHigh) << 32) + data.nFileSizeLow;
}
inline file_time_type get_write_time(const WIN32_FIND_DATAW& data) {
- ULARGE_INTEGER tmp;
- const FILETIME& time = data.ftLastWriteTime;
- tmp.u.LowPart = time.dwLowDateTime;
- tmp.u.HighPart = time.dwHighDateTime;
- return file_time_type(file_time_type::duration(tmp.QuadPart));
+ using detail::fs_time;
+ auto ts = filetime_to_timespec(time);
+ if (!fs_time::is_representable(ts))
+ return file_time_type::min();
+ return fs_time::convert_from_timespec(ts);
+}
+inline perms get_file_perm(const WIN32_FIND_DATAW& data) {
+ unsigned st_mode = 0555; // Read-only
+ if (!(data.dwFileAttributes & FILE_ATTRIBUTE_READONLY))
+ st_mode |= 0222; // Write
+ return static_cast<perms>(st_mode) & perms::mask;
}
#endif // !_LIBCPP_WIN32API
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm concerned this opens the library up to more TOCTOU attacks. Performance is one consideration, but correctness and security are others. Is it possible for the cached values to become stale/incorrect?
I would like to see some tests that the cached values are actually correct when entries in the directory change.
Will be upstreamed in/ llvm/llvm-project#93316 c8062150c5a7da5f056ef6edbaf5dd8c76020903
Will be upstreamed in/ llvm/llvm-project#93316 c8062150c5a7da5f056ef6edbaf5dd8c76020903
Will be upstreamed in/ llvm/llvm-project#93316 c8062150c5a7da5f056ef6edbaf5dd8c76020903
Will be upstreamed in/ llvm/llvm-project#93316 c8062150c5a7da5f056ef6edbaf5dd8c76020903
TL;DR: I do not consider more TOCTOU and less TOCTOU as the correct method of thinking. The implementation either protects you from TOCTOU (none does) or not. @EricWF, thanks for pointing to this issue. There is, of course, TOCTOU issue with the code, yet this PR neither introduced it nor do we consider it as a problem. filesystem is a naturally shared resource not controlled by a single process, hence one should either
OR
I do not think that either option is an std library responsibility.
So, technically speaking, this PR makes current implementation more conformant to the standard than it was before. Now, let's observe particular consequences brought by this PR. First of all, this PR only affects POSIX-implementations will require additional lstat / stat syscall upon accessing the first attribute. libc++ can cache attributes inside directory_entry (upon first invocation of In both implementations the user can call References to other implementations
|
There is also p0317 with a thorough considerations on the subject. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
Thank you for the detailed response. I really appreciate all of the context and careful consideration. You've thoroughly addressed my question. I would still request that we add tests that observe the change in behavior. At the moment, the caching behavior could regress without notice. I'll happily approve the change with tests. |
08ca1dc
to
e6c1daf
Compare
Thank you. I wrote test to compare cached values with values after calling |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please wait until the bots come back.
But otherwise it looks good to me.
libcxx/test/std/input.output/filesystems/class.rec.dir.itr/cache_refresh_iter.pass.cpp
Outdated
Show resolved
Hide resolved
@EricWF, could you, please approve it once more? The latest version should have the CI fixed. Could you, please, provide us with some clues? |
Will be upstreamed in/ llvm/llvm-project#93316 c8062150c5a7da5f056ef6edbaf5dd8c76020903
@georgthegreat Approved the workflow just now. The CI failure appears to be happening on macOS. It's too late right now but I will have a look tomorrow or in the coming days, I should be able to help a bit since I have access to these machines more easily than most people. |
Ahhhh, I understand what's going on. I think that as-is, this change is actually an ABI break but there should be a way to deploy it. Basically, The typical way of making changes of this kind is to guard the new code you want to introduce behind a macro that basically says "I want promise I am not deploying to a system that doesn't have the changes in this PR". You can look at how e.g. #if _LIBCPP_AVAILABILITY_HAS_FULLY_POPULATED_CACHED_ENTRY
...
#endif where, in #define _LIBCPP_AVAILABILITY_HAS_FULLY_POPULATED_CACHED_ENTRY _LIBCPP_INTRODUCED_IN_LLVM_19 You might also want to slightly reorganize the code changes in your PR to avoid making it a maze of |
@ldionne, thanks a lot. We use fully static linkage, so ABI breakages are not something that we use to work with. |
PR will need to change to avoid breaking ABI
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Requesting changes just so it shows up correctly in the PR queue. Let me know if you need help with implementing what I outlined above.
adc8881
to
5f78cfc
Compare
Just a reminder that the LLVM 19 branch point is happening next week, so if we want this to make it into the branch we'd have to land it soon. |
Ping. It would be great to have this in LLVM 19. The sooner we have this in, the sooner we can eventually remove the backdeployment logic (a long time from now but still). |
Hello. Sorry I was very busy on my work. I will spend today and weekends on fixes. |
6d4fe04
to
89c5d7e
Compare
Hello @ldionne |
@ldionne, it looks like the current implementation is both ABI-stable and straight forward to review. |
@ldionne, gentle ping. |
@ldionne, let's merge it! |
Ack, looking now. Sorry for the delay, this went under my radar and then I started focusing more on things that could definitely hit the LLVM 19 milestone. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with minor comments.
libcxx/test/std/input.output/filesystems/class.rec.dir.itr/cache_refresh_iter.pass.cpp
Outdated
Show resolved
Hide resolved
libcxx/test/std/input.output/filesystems/class.rec.dir.itr/cache_refresh_iter.pass.cpp
Outdated
Show resolved
Hide resolved
libcxx/test/std/input.output/filesystems/class.rec.dir.itr/cache_refresh_iter.pass.cpp
Outdated
Show resolved
Hide resolved
@EricWF Please take a look after all changes and fixes. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM once CI is green. The ARM CI might be failing due to the zdump issue -- if so that's unrelated to this patch and shouldn't block merging. Whoever sees the green CI first can merge.
@ldionne, sorry, but could you help us to interpret the failures? It looks like aix tests have failed too... |
I think none of the failures are related to this change. Merging, thanks! |
@ed-sat Congratulations on having your first Pull Request (PR) merged into the LLVM Project! Your changes will be combined with recent changes from other authors, then tested by our build bots. If there is a problem with a build, you may receive a report in an email or a comment on this PR. Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. It is not uncommon for your change to be included in a build that fails due to someone else's changes, or infrastructure issues. How to do this, and the rest of the post-merge process, is covered in detail here. If your change does cause a problem, it may be reverted, or you can revert it yourself. This is a normal part of LLVM development. You can fix your changes and open a new PR to merge them again. If you don't get any reports, no action is required from you. Your changes are working as expected, well done! |
This backports [PR #93316](llvm/llvm-project#93316) for real. commit_hash:c06d30afb498591c84dd306c44227026d402b479
This backports [PR #93316](llvm/llvm-project#93316) for real. commit_hash:c06d30afb498591c84dd306c44227026d402b479
This backports [PR #93316](llvm/llvm-project#93316) for real. commit_hash:c06d30afb498591c84dd306c44227026d402b479
This backports [PR #93316](llvm/llvm-project#93316) for real. commit_hash:c06d30afb498591c84dd306c44227026d402b479
This backports [PR #93316](llvm/llvm-project#93316) for real. commit_hash:c06d30afb498591c84dd306c44227026d402b479
This backports [PR #93316](llvm/llvm-project#93316) for real. commit_hash:c06d30afb498591c84dd306c44227026d402b479
Added caching file attributes during directory iteration for OS Windows. Allows improving performance working with files in directory.