diff --git a/README.md b/README.md index 84f4da9..6da4af9 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,27 @@ # URL Encoder for Java -A simple library to encode/decode URL parameters. +A simple defensive library to encode/decode URL components. This library was extracted from the [RIFE2 Web Application Framework](https://rife2.com). A Kotlin version can also be found at [https://github.com/ethauvin/urlencoder](https://github.com/ethauvin/urlencoder). -For decades, we've been using [java.net.URLEncoder](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/net/URLEncoder.html) -because of its improper naming. It is actually intended to encode HTML form -parameters, not URLs, causing the wrong escape sequences to be used. +The rules are determined by combining the unreserved character set from +[RFC 3986](https://www.rfc-editor.org/rfc/rfc3986#page-13) with the +percent-encode set from +[application/x-www-form-urlencoded](https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set). -Additionally, `java.net.URLEncoder` allocates memory even when no encoding is -necessary, significantly impacting performance. This library has a negligible -performance impact when a specified string doesn't need to be encoded. +Both specs above support percent decoding of two hexadecimal digits to a +binary octet, however their unreserved set of characters differs and +`application/x-www-form-urlencoded` adds conversion of space to `+`, +which has the potential to be misunderstood. -Android's [Uri.encode](https://developer.android.com/reference/android/net/Uri#encode(java.lang.String,%20java.lang.String)) -also addresses the same issues. +This class encodes with rules that will be decoded correctly in either case. 
+ +Additionally, this library allocates no memory when encoding isn't needed and +does the work in a single pass without multiple loops. Both of these +optimizations have a significantly beneficial impact on the performance of encoding +compared to other solutions like the standard URLEncoder in the JDK. ## Examples (TL;DR) @@ -57,7 +63,7 @@ You have two options: The usage is as follows: ``` -Encode and decode URL parameters. +Encode and decode URL components defensively. -e encode (default) -d decode ``` diff --git a/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java b/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java index 7a2de08..8e736f3 100644 --- a/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java +++ b/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java @@ -8,9 +8,19 @@ import java.nio.charset.StandardCharsets; import java.util.*; /** - * URL encoding and decoding. + * Most defensive approach to URL encoding and decoding. *

- * Rules determined by RFC 3986. + * Rules determined by combining the unreserved character set from + * RFC 3986 with + * the percent-encode set from + * application/x-www-form-urlencoded. + *

+ * Both specs above support percent decoding of two hexadecimal digits to a + * binary octet, however their unreserved set of characters differs and + * {@code application/x-www-form-urlencoded} adds conversion of space to +, + * which has the potential to be misunderstood. + *

+ * This class encodes with rules that will be decoded correctly in either case. * * @author Geert Bevin (gbevin[remove] at uwyn dot com) * @author Erik C. Thauvin (erik@thauvin.net) @@ -22,14 +32,13 @@ public final class UrlEncoder { static { // see https://www.rfc-editor.org/rfc/rfc3986#page-13 - var unreserved = new BitSet('~' + 1); + var unreserved = new BitSet('z' + 1); unreserved.set('-'); unreserved.set('.'); for (int c = '0'; c <= '9'; ++c) unreserved.set(c); for (int c = 'A'; c <= 'Z'; ++c) unreserved.set(c); unreserved.set('_'); for (int c = 'a'; c <= 'z'; ++c) unreserved.set(c); - unreserved.set('~'); UNRESERVED_URI_CHARS = unreserved; } @@ -216,7 +225,7 @@ public final class UrlEncoder { // see https://www.rfc-editor.org/rfc/rfc3986#page-13 private static boolean isUnreservedUriChar(char ch) { - return ch <= '~' && UNRESERVED_URI_CHARS.get(ch); + return ch <= 'z' && UNRESERVED_URI_CHARS.get(ch); } static class MainResult { diff --git a/lib/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java b/lib/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java index ecf6624..99cf385 100644 --- a/lib/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java +++ b/lib/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java @@ -16,7 +16,7 @@ import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.params.provider.Arguments.arguments; class UrlEncoderTest { - private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~"; + private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_."; private static Stream invalid() { return Stream.of("sdkjfh%", "sdkjfh%6", "sdkjfh%xx", "sdfjfh%-1"); @@ -27,7 +27,7 @@ class UrlEncoderTest { arguments("a test &", "a%20test%20%26"), arguments( "!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=", - "%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~%3D" + 
"%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.%7E%3D" ), arguments("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"), arguments(