From b12f3eafd7d2deb6b0e9d33736607eb8e02ea51e Mon Sep 17 00:00:00 2001 From: Geert Bevin Date: Wed, 4 Jan 2023 21:08:31 -0500 Subject: [PATCH 1/8] Made the encoding even more defensive. --- README.md | 26 ++++++++++++------- .../java/com/uwyn/urlencoder/UrlEncoder.java | 19 ++++++++++---- .../com/uwyn/urlencoder/UrlEncoderTest.java | 4 +-- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 84f4da9..6da4af9 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,27 @@ # URL Encoder for Java -A simple library to encode/decode URL parameters. +A simple defensive library to encode/decode URL components. This library was extracted from the [RIFE2 Web Application Framework](https://rife2.com). A Kotlin version can also be found at [https://github.com/ethauvin/urlencoder](https://github.com/ethauvin/urlencoder). -For decades, we've been using [java.net.URLEncoder](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/net/URLEncoder.html) -because of its improper naming. It is actually intended to encode HTML form -parameters, not URLs, causing the wrong escape sequences to be used. +The rules are determined by combining the unreserved character set from +[RFC 3986](https://www.rfc-editor.org/rfc/rfc3986#page-13) with the +percent-encode set from +[application/x-www-form-urlencoded](https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set). -Additionally, `java.net.URLEncoder` allocates memory even when no encoding is -necessary, significantly impacting performance. This library has a negligible -performance impact when a specified string doesn't need to be encoded. +Both specs above support percent decoding of two hexadecimal digits to a +binary octet, however their unreserved set of characters differs and +`application/x-www-form-urlencoded` adds conversion of space to `+`, +that has the potential to be misunderstood. 
-Android's [Uri.encode](https://developer.android.com/reference/android/net/Uri#encode(java.lang.String,%20java.lang.String)) -also addresses the same issues. +This class encodes with rules that will be decoded correctly in either case. + +Additionally, this library allocates no memory when encoding isn't needed and +does the work in a single pass without multiple loops, both of these +optimizations have a significantly beneficial impact on performance of encoding +compared to other solutions like the standard URLEncoder in the JDK. ## Examples (TL;DR) @@ -57,7 +63,7 @@ You have two options: The usage is as follows: ``` -Encode and decode URL parameters. +Encode and decode URL components defensively. -e encode (default) -d decode ``` diff --git a/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java b/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java index 7a2de08..8e736f3 100644 --- a/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java +++ b/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java @@ -8,9 +8,19 @@ import java.nio.charset.StandardCharsets; import java.util.*; /** - * URL encoding and decoding. + * Most defensive approach to URL encoding and decoding. *

- * Rules determined by RFC 3986. + * Rules determined by combining the unreserved character set from + * RFC 3986 with + * the percent-encode set from + * application/x-www-form-urlencoded. + *

+ * Both specs above support percent decoding of two hexadecimal digits to a + * binary octet; however, their unreserved set of characters differs and + * {@code application/x-www-form-urlencoded} adds conversion of space to {@code +}, + * which has the potential to be misunderstood. + *

+ * This class encodes with rules that will be decoded correctly in either case. * * @author Geert Bevin (gbevin[remove] at uwyn dot com) * @author Erik C. Thauvin (erik@thauvin.net) @@ -22,14 +32,13 @@ public final class UrlEncoder { static { // see https://www.rfc-editor.org/rfc/rfc3986#page-13 - var unreserved = new BitSet('~' + 1); + var unreserved = new BitSet('z' + 1); unreserved.set('-'); unreserved.set('.'); for (int c = '0'; c <= '9'; ++c) unreserved.set(c); for (int c = 'A'; c <= 'Z'; ++c) unreserved.set(c); unreserved.set('_'); for (int c = 'a'; c <= 'z'; ++c) unreserved.set(c); - unreserved.set('~'); UNRESERVED_URI_CHARS = unreserved; } @@ -216,7 +225,7 @@ public final class UrlEncoder { // see https://www.rfc-editor.org/rfc/rfc3986#page-13 private static boolean isUnreservedUriChar(char ch) { - return ch <= '~' && UNRESERVED_URI_CHARS.get(ch); + return ch <= 'z' && UNRESERVED_URI_CHARS.get(ch); } static class MainResult { diff --git a/lib/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java b/lib/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java index ecf6624..99cf385 100644 --- a/lib/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java +++ b/lib/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java @@ -16,7 +16,7 @@ import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.params.provider.Arguments.arguments; class UrlEncoderTest { - private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~"; + private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_."; private static Stream invalid() { return Stream.of("sdkjfh%", "sdkjfh%6", "sdkjfh%xx", "sdfjfh%-1"); @@ -27,7 +27,7 @@ class UrlEncoderTest { arguments("a test &", "a%20test%20%26"), arguments( "!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=", - "%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~%3D" + 
"%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.%7E%3D" ), arguments("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"), arguments( From 954aa3e92edecf015c0e3d316f0542845aa1f49d Mon Sep 17 00:00:00 2001 From: Geert Bevin Date: Wed, 4 Jan 2023 21:15:32 -0500 Subject: [PATCH 2/8] Minor clarifications --- lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java b/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java index 8e736f3..0c53377 100644 --- a/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java +++ b/lib/src/main/java/com/uwyn/urlencoder/UrlEncoder.java @@ -32,6 +32,7 @@ public final class UrlEncoder { static { // see https://www.rfc-editor.org/rfc/rfc3986#page-13 + // and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set var unreserved = new BitSet('z' + 1); unreserved.set('-'); unreserved.set('.'); @@ -224,6 +225,7 @@ public final class UrlEncoder { } // see https://www.rfc-editor.org/rfc/rfc3986#page-13 + // and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set private static boolean isUnreservedUriChar(char ch) { return ch <= 'z' && UNRESERVED_URI_CHARS.get(ch); } @@ -262,7 +264,7 @@ public final class UrlEncoder { if (!valid_arguments) { return new MainResult("Usage : java -jar urlencoder-*.jar [-ed] text" + System.lineSeparator() + - "Encode and decode URL parameters." + System.lineSeparator() + + "Encode and decode URL components defensively." 
+ System.lineSeparator() + " -e encode (default)" + System.lineSeparator() + " -d decode", 1); } From a02e617f5bd11ed0fc34ab8175469bb283efe37e Mon Sep 17 00:00:00 2001 From: Geert Bevin Date: Wed, 4 Jan 2023 21:16:36 -0500 Subject: [PATCH 3/8] Updated version to 1.2.0-SNAPSHOT --- lib/build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/build.gradle.kts b/lib/build.gradle.kts index 04e2a21..86570bf 100644 --- a/lib/build.gradle.kts +++ b/lib/build.gradle.kts @@ -13,7 +13,7 @@ plugins { } group = "com.uwyn" -version = "1.1.0" +version = "1.2.0-SNAPSHOT" val mavenName = "UrlEncoder" val javaMainClass = "$group.${rootProject.name}.$mavenName" From 9c3e68bb4461c3daf05e3a0bc244a0209e669a1f Mon Sep 17 00:00:00 2001 From: "Erik C. Thauvin" Date: Wed, 4 Jan 2023 19:19:15 -0800 Subject: [PATCH 4/8] Cleaned up description --- lib/build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/build.gradle.kts b/lib/build.gradle.kts index 86570bf..f7c20e6 100644 --- a/lib/build.gradle.kts +++ b/lib/build.gradle.kts @@ -112,7 +112,7 @@ publishing { from(components["java"]) pom { name.set(mavenName) - description.set("A simple library to encode/decode URL parameters") + description.set("A simple defensive library to encode/decode URL components") url.set("https://github.com/gbevin/urlencoder") licenses { license { From f84caca6b2d75f88da561dc953e1d44e55065c79 Mon Sep 17 00:00:00 2001 From: "Erik C. Thauvin" Date: Wed, 4 Jan 2023 19:23:47 -0800 Subject: [PATCH 5/8] Minor cleanup --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6da4af9..8e3ac12 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ that has the potential to be misunderstood. This class encodes with rules that will be decoded correctly in either case. 
Additionally, this library allocates no memory when encoding isn't needed and -does the work in a single pass without multiple loops, both of these +does the work in a single pass without multiple loops. Both of these optimizations have a significantly beneficial impact on performance of encoding compared to other solutions like the standard URLEncoder in the JDK. From 5c2881c8454b61c5cd572fbaecd476745fa566d2 Mon Sep 17 00:00:00 2001 From: "Erik C. Thauvin" Date: Wed, 4 Jan 2023 19:24:28 -0800 Subject: [PATCH 6/8] Minor cleanup --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e3ac12..64a2036 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ This class encodes with rules that will be decoded correctly in either case. Additionally, this library allocates no memory when encoding isn't needed and does the work in a single pass without multiple loops. Both of these optimizations have a significantly beneficial impact on performance of encoding -compared to other solutions like the standard URLEncoder in the JDK. +compared to other solutions like the standard `URLEncoder` in the JDK. ## Examples (TL;DR) From 6bebfc3cabff98574a779592ca5d0822ebb18037 Mon Sep 17 00:00:00 2001 From: Geert Bevin Date: Wed, 4 Jan 2023 22:37:44 -0500 Subject: [PATCH 7/8] Minor readme change --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 64a2036..c5d2660 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ This class encodes with rules that will be decoded correctly in either case. Additionally, this library allocates no memory when encoding isn't needed and does the work in a single pass without multiple loops. Both of these optimizations have a significantly beneficial impact on performance of encoding -compared to other solutions like the standard `URLEncoder` in the JDK. 
+compared to other solutions like the standard `URLEncoder` in the JDK or +`UriUtils` in Spring. ## Examples (TL;DR) From 2409932ac5195f45aa2f7700072072d5e868cda9 Mon Sep 17 00:00:00 2001 From: Geert Bevin Date: Wed, 4 Jan 2023 22:52:35 -0500 Subject: [PATCH 8/8] Updated version to 1.2.0 --- lib/build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/build.gradle.kts b/lib/build.gradle.kts index f7c20e6..87a3442 100644 --- a/lib/build.gradle.kts +++ b/lib/build.gradle.kts @@ -13,7 +13,7 @@ plugins { } group = "com.uwyn" -version = "1.2.0-SNAPSHOT" +version = "1.2.0" val mavenName = "UrlEncoder" val javaMainClass = "$group.${rootProject.name}.$mavenName"