@@ -27,7 +27,6 @@ use itertools::Itertools;
2727use log:: debug;
2828use object_store:: path:: Path ;
2929use object_store:: { ObjectMeta , ObjectStore } ;
30- use percent_encoding;
3130use std:: sync:: Arc ;
3231use url:: Url ;
3332
@@ -46,6 +45,16 @@ pub struct ListingTableUrl {
4645impl ListingTableUrl {
4746 /// Parse a provided string as a `ListingTableUrl`
4847 ///
48+ /// # URL Encoding
49+ ///
50+ /// URL paths are expected to be URL-encoded. That is, the URL for a file named `bar%2Efoo`
51+ /// would be `file:///bar%252Efoo`, as per the [URL] specification.
52+ ///
53+ /// It should be noted that some tools, such as the AWS CLI, take a different approach and
54+ /// instead interpret the URL path verbatim. For example the object `bar%2Efoo` would be
55+ /// addressed as `s3://BUCKET/bar%252Efoo` using [`ListingTableUrl`] but `s3://BUCKET/bar%2Efoo`
56+ /// when using the aws-cli.
57+ ///
4958 /// # Paths without a Scheme
5059 ///
5160 /// If no scheme is provided, or the string is an absolute filesystem path
@@ -77,6 +86,7 @@ impl ListingTableUrl {
7786 /// filter when listing files from object storage
7887 ///
7988 /// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme
89+ /// [URL]: https://url.spec.whatwg.org/
8090 pub fn parse ( s : impl AsRef < str > ) -> Result < Self > {
8191 let s = s. as_ref ( ) ;
8292
@@ -86,7 +96,7 @@ impl ListingTableUrl {
8696 }
8797
8898 match Url :: parse ( s) {
89- Ok ( url) => Ok ( Self :: new ( url, None ) ) ,
99+ Ok ( url) => Self :: try_new ( url, None ) ,
90100 Err ( url:: ParseError :: RelativeUrlWithoutBase ) => Self :: parse_path ( s) ,
91101 Err ( e) => Err ( DataFusionError :: External ( Box :: new ( e) ) ) ,
92102 }
@@ -138,15 +148,13 @@ impl ListingTableUrl {
138148 . map_err ( |_| DataFusionError :: Internal ( format ! ( "Can not open path: {s}" ) ) ) ?;
139149 // TODO: Currently we do not have an IO-related error variant that accepts ()
140150 // or a string. Once we have such a variant, change the error type above.
141- Ok ( Self :: new ( url, glob) )
151+ Self :: try_new ( url, glob)
142152 }
143153
144154 /// Creates a new [`ListingTableUrl`] from a url and optional glob expression
145- fn new ( url : Url , glob : Option < Pattern > ) -> Self {
146- let decoded_path =
147- percent_encoding:: percent_decode_str ( url. path ( ) ) . decode_utf8_lossy ( ) ;
148- let prefix = Path :: from ( decoded_path. as_ref ( ) ) ;
149- Self { url, prefix, glob }
155+ fn try_new ( url : Url , glob : Option < Pattern > ) -> Result < Self > {
156+ let prefix = Path :: from_url_path ( url. path ( ) ) ?;
157+ Ok ( Self { url, prefix, glob } )
150158 }
151159
152160 /// Returns the URL scheme
@@ -286,6 +294,7 @@ fn split_glob_expression(path: &str) -> Option<(&str, &str)> {
286294#[ cfg( test) ]
287295mod tests {
288296 use super :: * ;
297+ use tempfile:: tempdir;
289298
290299 #[ test]
291300 fn test_prefix_path ( ) {
@@ -317,8 +326,27 @@ mod tests {
317326 let url = ListingTableUrl :: parse ( "file:///foo/bar?" ) . unwrap ( ) ;
318327 assert_eq ! ( url. prefix. as_ref( ) , "foo/bar" ) ;
319328
320- let url = ListingTableUrl :: parse ( "file:///foo/😺" ) . unwrap ( ) ;
321- assert_eq ! ( url. prefix. as_ref( ) , "foo/%F0%9F%98%BA" ) ;
329+ let err = ListingTableUrl :: parse ( "file:///foo/😺" ) . unwrap_err ( ) ;
330+ assert_eq ! ( err. to_string( ) , "Object Store error: Encountered object with invalid path: Error parsing Path \" /foo/😺\" : Encountered illegal character sequence \" 😺\" whilst parsing path segment \" 😺\" " ) ;
331+
332+ let url = ListingTableUrl :: parse ( "file:///foo/bar%2Efoo" ) . unwrap ( ) ;
333+ assert_eq ! ( url. prefix. as_ref( ) , "foo/bar.foo" ) ;
334+
335+ let url = ListingTableUrl :: parse ( "file:///foo/bar%2Efoo" ) . unwrap ( ) ;
336+ assert_eq ! ( url. prefix. as_ref( ) , "foo/bar.foo" ) ;
337+
338+ let url = ListingTableUrl :: parse ( "file:///foo/bar%252Ffoo" ) . unwrap ( ) ;
339+ assert_eq ! ( url. prefix. as_ref( ) , "foo/bar%2Ffoo" ) ;
340+
341+ let url = ListingTableUrl :: parse ( "file:///foo/a%252Fb.txt" ) . unwrap ( ) ;
342+ assert_eq ! ( url. prefix. as_ref( ) , "foo/a%2Fb.txt" ) ;
343+
344+ let dir = tempdir ( ) . unwrap ( ) ;
345+ let path = dir. path ( ) . join ( "bar%2Ffoo" ) ;
346+ std:: fs:: File :: create ( & path) . unwrap ( ) ;
347+
348+ let url = ListingTableUrl :: parse ( path. to_str ( ) . unwrap ( ) ) . unwrap ( ) ;
349+ assert ! ( url. prefix. as_ref( ) . ends_with( "bar%2Ffoo" ) , "{}" , url. prefix) ;
322350 }
323351
324352 #[ test]
0 commit comments