|
|
From bd970a7390d9af5ce859397a6c368d2465368d76 Mon Sep 17 00:00:00 2001
|
|
|
From: "Darrick J. Wong" <djwong@kernel.org>
|
|
|
Date: Wed, 31 May 2023 11:13:21 +0200
|
|
|
Subject: [PATCH] xfs: stabilize the dirent name transformation function used
|
|
|
for ascii-ci dir hash computation
|
|
|
MIME-Version: 1.0
|
|
|
Content-Type: text/plain; charset=UTF-8
|
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
|
|
Source kernel commit: a9248538facc3d9e769489e50a544509c2f9cebe
|
|
|
|
|
|
Back in the old days, the "ascii-ci" feature was created to implement
|
|
|
case-insensitive directory entry lookups for latin1-encoded names and
|
|
|
remove the large overhead of Samba's case-insensitive lookup code. UTF8
|
|
|
names were not allowed, but nobody explicitly wrote in the documentation
|
|
|
that this was only expected to work if the system used latin1 names.
|
|
|
The kernel tolower function was selected to prepare names for hashed
|
|
|
lookups.
|
|
|
|
|
|
There's a major discrepancy in the function that computes directory entry
|
|
|
hashes for filesystems that have ASCII case-insensitive lookups enabled.
|
|
|
The root of this is that the kernel and glibc's tolower implementations
|
|
|
have differing behavior for extended ASCII accented characters. I wrote
|
|
|
a program to spit out characters for which the tolower() return value is
|
|
|
different from the input:
|
|
|
|
|
|
glibc tolower:
|
|
|
65:A 66:B 67:C 68:D 69:E 70:F 71:G 72:H 73:I 74:J 75:K 76:L 77:M 78:N
|
|
|
79:O 80:P 81:Q 82:R 83:S 84:T 85:U 86:V 87:W 88:X 89:Y 90:Z
|
|
|
|
|
|
kernel tolower:
|
|
|
65:A 66:B 67:C 68:D 69:E 70:F 71:G 72:H 73:I 74:J 75:K 76:L 77:M 78:N
|
|
|
79:O 80:P 81:Q 82:R 83:S 84:T 85:U 86:V 87:W 88:X 89:Y 90:Z 192:À 193:Á
|
|
|
194:Â 195:Ã 196:Ä 197:Å 198:Æ 199:Ç 200:È 201:É 202:Ê 203:Ë 204:Ì 205:Í
|
|
|
206:Î 207:Ï 208:Ð 209:Ñ 210:Ò 211:Ó 212:Ô 213:Õ 214:Ö 215:× 216:Ø 217:Ù
|
|
|
218:Ú 219:Û 220:Ü 221:Ý 222:Þ
|
|
|
|
|
|
Which means that the kernel and userspace do not agree on the hash value
|
|
|
for a directory filename that contains those higher values. The hash
|
|
|
values are written into the leaf index block of directories that are
|
|
|
larger than two blocks in size, which means that xfs_repair will flag
|
|
|
these directories as having corrupted hash indexes and rewrite the index
|
|
|
with hash values that the kernel now will not recognize.
|
|
|
|
|
|
Because the ascii-ci feature is not frequently enabled and the kernel
|
|
|
touches filesystems far more frequently than xfs_repair does, fix this
|
|
|
by encoding the kernel's isupper predicate and tolower functions into
|
|
|
libxfs. Give the new functions less provocative names to make it really
|
|
|
obvious that this is a pre-hash name preparation function, and nothing
|
|
|
else. This change makes userspace's behavior consistent with the
|
|
|
kernel.
|
|
|
|
|
|
Found by auditing obfuscate_name in xfs_metadump as part of working on
|
|
|
parent pointers, wondering how it could possibly work correctly with ci
|
|
|
filesystems, writing a test tool to create a directory with
|
|
|
hash-colliding names, and watching xfs_repair flag it.
|
|
|
|
|
|
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
|
|
|
Reviewed-by: Christoph Hellwig <hch@lst.de>
|
|
|
Signed-off-by: Carlos Maiolino <cem@kernel.org>
|
|
|
Signed-off-by: Pavel Reichl <preichl@redhat.com>
|
|
|
---
|
|
|
libxfs/xfs_dir2.c | 5 +++--
|
|
|
libxfs/xfs_dir2.h | 31 +++++++++++++++++++++++++++++++
|
|
|
2 files changed, 34 insertions(+), 2 deletions(-)
|
|
|
|
|
|
diff --git a/libxfs/xfs_dir2.c b/libxfs/xfs_dir2.c
|
|
|
index d6a19296..c19684b3 100644
|
|
|
--- a/libxfs/xfs_dir2.c
|
|
|
+++ b/libxfs/xfs_dir2.c
|
|
|
@@ -63,7 +63,7 @@ xfs_ascii_ci_hashname(
|
|
|
int i;
|
|
|
|
|
|
for (i = 0, hash = 0; i < name->len; i++)
|
|
|
- hash = tolower(name->name[i]) ^ rol32(hash, 7);
|
|
|
+ hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7);
|
|
|
|
|
|
return hash;
|
|
|
}
|
|
|
@@ -84,7 +84,8 @@ xfs_ascii_ci_compname(
|
|
|
for (i = 0; i < len; i++) {
|
|
|
if (args->name[i] == name[i])
|
|
|
continue;
|
|
|
- if (tolower(args->name[i]) != tolower(name[i]))
|
|
|
+ if (xfs_ascii_ci_xfrm(args->name[i]) !=
|
|
|
+ xfs_ascii_ci_xfrm(name[i]))
|
|
|
return XFS_CMP_DIFFERENT;
|
|
|
result = XFS_CMP_CASE;
|
|
|
}
|
|
|
diff --git a/libxfs/xfs_dir2.h b/libxfs/xfs_dir2.h
|
|
|
index dd39f17d..19af22a1 100644
|
|
|
--- a/libxfs/xfs_dir2.h
|
|
|
+++ b/libxfs/xfs_dir2.h
|
|
|
@@ -248,4 +248,35 @@ unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo,
|
|
|
struct xfs_dir2_data_hdr *hdr);
|
|
|
bool xfs_dir2_namecheck(const void *name, size_t length);
|
|
|
|
|
|
+/*
|
|
|
+ * The "ascii-ci" feature was created to speed up case-insensitive lookups for
|
|
|
+ * a Samba product. Because of the inherent problems with CI and UTF-8
|
|
|
+ * encoding, etc, it was decided that Samba would be configured to export
|
|
|
+ * latin1/iso 8859-1 encodings as that covered >90% of the target markets for
|
|
|
+ * the product. Hence the "ascii-ci" casefolding code could be encoded into
|
|
|
+ * the XFS directory operations and remove all the overhead of casefolding from
|
|
|
+ * Samba.
|
|
|
+ *
|
|
|
+ * To provide consistent hashing behavior between the userspace and kernel,
|
|
|
+ * these functions prepare names for hashing by transforming specific bytes
|
|
|
+ * to other bytes. Robustness with other encodings is not guaranteed.
|
|
|
+ */
|
|
|
+static inline bool xfs_ascii_ci_need_xfrm(unsigned char c)
|
|
|
+{
|
|
|
+ if (c >= 0x41 && c <= 0x5a) /* A-Z */
|
|
|
+ return true;
|
|
|
+ if (c >= 0xc0 && c <= 0xd6) /* latin A-O with accents */
|
|
|
+ return true;
|
|
|
+ if (c >= 0xd8 && c <= 0xde) /* latin O-Y with accents */
|
|
|
+ return true;
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
+static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c)
|
|
|
+{
|
|
|
+ if (xfs_ascii_ci_need_xfrm(c))
|
|
|
+ c -= 'A' - 'a';
|
|
|
+ return c;
|
|
|
+}
|
|
|
+
|
|
|
#endif /* __XFS_DIR2_H__ */
|
|
|
--
|
|
|
2.41.0
|
|
|
|