Skip to content

Commit 4e60cdd

Browse files
authored
Percent Decode URL Paths (#8009) (#8012)
* Treat ListingTableUrl as URL-encoded (#8009) * Update lockfile * Review feedback
1 parent 69ba82f commit 4e60cdd

File tree

3 files changed

+38
-12
lines changed

3 files changed

+38
-12
lines changed

datafusion-cli/Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/core/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ num_cpus = "1.13.0"
8282
object_store = "0.7.0"
8383
parking_lot = "0.12"
8484
parquet = { workspace = true, optional = true }
85-
percent-encoding = "2.2.0"
8685
pin-project-lite = "^0.2.7"
8786
rand = "0.8"
8887
sqlparser = { workspace = true }

datafusion/core/src/datasource/listing/url.rs

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ use itertools::Itertools;
2727
use log::debug;
2828
use object_store::path::Path;
2929
use object_store::{ObjectMeta, ObjectStore};
30-
use percent_encoding;
3130
use std::sync::Arc;
3231
use url::Url;
3332

@@ -46,6 +45,16 @@ pub struct ListingTableUrl {
4645
impl ListingTableUrl {
4746
/// Parse a provided string as a `ListingTableUrl`
4847
///
48+
/// # URL Encoding
49+
///
50+
/// URL paths are expected to be URL-encoded. That is, the URL for a file named `bar%2Efoo`
51+
/// would be `file:///bar%252Efoo`, as per the [URL] specification.
52+
///
53+
/// It should be noted that some tools, such as the AWS CLI, take a different approach and
54+
/// instead interpret the URL path verbatim. For example the object `bar%2Efoo` would be
55+
/// addressed as `s3://BUCKET/bar%252Efoo` using [`ListingTableUrl`] but `s3://BUCKET/bar%2Efoo`
56+
/// when using the aws-cli.
57+
///
4958
/// # Paths without a Scheme
5059
///
5160
/// If no scheme is provided, or the string is an absolute filesystem path
@@ -77,6 +86,7 @@ impl ListingTableUrl {
7786
/// filter when listing files from object storage
7887
///
7988
/// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme
89+
/// [URL]: https://url.spec.whatwg.org/
8090
pub fn parse(s: impl AsRef<str>) -> Result<Self> {
8191
let s = s.as_ref();
8292

@@ -86,7 +96,7 @@ impl ListingTableUrl {
8696
}
8797

8898
match Url::parse(s) {
89-
Ok(url) => Ok(Self::new(url, None)),
99+
Ok(url) => Self::try_new(url, None),
90100
Err(url::ParseError::RelativeUrlWithoutBase) => Self::parse_path(s),
91101
Err(e) => Err(DataFusionError::External(Box::new(e))),
92102
}
@@ -138,15 +148,13 @@ impl ListingTableUrl {
138148
.map_err(|_| DataFusionError::Internal(format!("Can not open path: {s}")))?;
139149
// TODO: Currently we do not have an IO-related error variant that accepts ()
140150
// or a string. Once we have such a variant, change the error type above.
141-
Ok(Self::new(url, glob))
151+
Self::try_new(url, glob)
142152
}
143153

144154
/// Creates a new [`ListingTableUrl`] from a url and optional glob expression
145-
fn new(url: Url, glob: Option<Pattern>) -> Self {
146-
let decoded_path =
147-
percent_encoding::percent_decode_str(url.path()).decode_utf8_lossy();
148-
let prefix = Path::from(decoded_path.as_ref());
149-
Self { url, prefix, glob }
155+
fn try_new(url: Url, glob: Option<Pattern>) -> Result<Self> {
156+
let prefix = Path::from_url_path(url.path())?;
157+
Ok(Self { url, prefix, glob })
150158
}
151159

152160
/// Returns the URL scheme
@@ -286,6 +294,7 @@ fn split_glob_expression(path: &str) -> Option<(&str, &str)> {
286294
#[cfg(test)]
287295
mod tests {
288296
use super::*;
297+
use tempfile::tempdir;
289298

290299
#[test]
291300
fn test_prefix_path() {
@@ -317,8 +326,27 @@ mod tests {
317326
let url = ListingTableUrl::parse("file:///foo/bar?").unwrap();
318327
assert_eq!(url.prefix.as_ref(), "foo/bar");
319328

320-
let url = ListingTableUrl::parse("file:///foo/😺").unwrap();
321-
assert_eq!(url.prefix.as_ref(), "foo/%F0%9F%98%BA");
329+
let err = ListingTableUrl::parse("file:///foo/😺").unwrap_err();
330+
assert_eq!(err.to_string(), "Object Store error: Encountered object with invalid path: Error parsing Path \"/foo/😺\": Encountered illegal character sequence \"😺\" whilst parsing path segment \"😺\"");
331+
332+
let url = ListingTableUrl::parse("file:///foo/bar%2Efoo").unwrap();
333+
assert_eq!(url.prefix.as_ref(), "foo/bar.foo");
334+
335+
let url = ListingTableUrl::parse("file:///foo/bar%2Efoo").unwrap();
336+
assert_eq!(url.prefix.as_ref(), "foo/bar.foo");
337+
338+
let url = ListingTableUrl::parse("file:///foo/bar%252Ffoo").unwrap();
339+
assert_eq!(url.prefix.as_ref(), "foo/bar%2Ffoo");
340+
341+
let url = ListingTableUrl::parse("file:///foo/a%252Fb.txt").unwrap();
342+
assert_eq!(url.prefix.as_ref(), "foo/a%2Fb.txt");
343+
344+
let dir = tempdir().unwrap();
345+
let path = dir.path().join("bar%2Ffoo");
346+
std::fs::File::create(&path).unwrap();
347+
348+
let url = ListingTableUrl::parse(path.to_str().unwrap()).unwrap();
349+
assert!(url.prefix.as_ref().ends_with("bar%2Ffoo"), "{}", url.prefix);
322350
}
323351

324352
#[test]

0 commit comments

Comments
 (0)