Made the encoding even more defensive.

This commit is contained in:
Geert Bevin 2023-01-04 21:08:31 -05:00
parent 320be3b3a1
commit b12f3eafd7
3 changed files with 32 additions and 17 deletions

View file

@ -8,21 +8,27 @@
# URL Encoder for Java # URL Encoder for Java
A simple library to encode/decode URL parameters. A simple defensive library to encode/decode URL components.
This library was extracted from the [RIFE2 Web Application Framework](https://rife2.com). This library was extracted from the [RIFE2 Web Application Framework](https://rife2.com).
A Kotlin version can also be found at [https://github.com/ethauvin/urlencoder](https://github.com/ethauvin/urlencoder). A Kotlin version can also be found at [https://github.com/ethauvin/urlencoder](https://github.com/ethauvin/urlencoder).
For decades, we've been using [java.net.URLEncoder](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/net/URLEncoder.html) The rules are determined by combining the unreserved character set from
because of its improper naming. It is actually intended to encode HTML form [RFC 3986](https://www.rfc-editor.org/rfc/rfc3986#page-13) with the
parameters, not URLs, causing the wrong escape sequences to be used. percent-encode set from
[application/x-www-form-urlencoded](https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set).
Additionally, `java.net.URLEncoder` allocates memory even when no encoding is Both specs above support percent decoding of two hexadecimal digits to a
necessary, significantly impacting performance. This library has a negligible binary octet; however, their unreserved set of characters differs and
performance impact when a specified string doesn't need to be encoded. `application/x-www-form-urlencoded` adds conversion of space to `+`,
which has the potential to be misunderstood.
Android's [Uri.encode](https://developer.android.com/reference/android/net/Uri#encode(java.lang.String,%20java.lang.String)) This class encodes with rules that will be decoded correctly in either case.
also addresses the same issues.
Additionally, this library allocates no memory when encoding isn't needed and
does the work in a single pass without multiple loops. Both of these
optimizations have a significantly beneficial impact on performance of encoding
compared to other solutions like the standard URLEncoder in the JDK.
## Examples (TL;DR) ## Examples (TL;DR)
@ -57,7 +63,7 @@ You have two options:
The usage is as follows: The usage is as follows:
``` ```
Encode and decode URL parameters. Encode and decode URL components defensively.
-e encode (default) -e encode (default)
-d decode -d decode
``` ```

View file

@ -8,9 +8,19 @@ import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
/** /**
* URL encoding and decoding. * Most defensive approach to URL encoding and decoding.
* <p> * <p>
* Rules determined by <a href="https://www.rfc-editor.org/rfc/rfc3986#page-13">RFC 3986</a>. * Rules determined by combining the unreserved character set from
* <a href="https://www.rfc-editor.org/rfc/rfc3986#page-13">RFC 3986</a> with
* the percent-encode set from
* <a href="https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set">application/x-www-form-urlencoded</a>.
* <p>
* Both specs above support percent decoding of two hexadecimal digits to a
* binary octet; however, their unreserved set of characters differs and
* {@code application/x-www-form-urlencoded} adds conversion of space to +,
* which has the potential to be misunderstood.
* <p>
* This class encodes with rules that will be decoded correctly in either case.
* *
* @author Geert Bevin (gbevin[remove] at uwyn dot com) * @author Geert Bevin (gbevin[remove] at uwyn dot com)
* @author Erik C. Thauvin (erik@thauvin.net) * @author Erik C. Thauvin (erik@thauvin.net)
@ -22,14 +32,13 @@ public final class UrlEncoder {
static { static {
// see https://www.rfc-editor.org/rfc/rfc3986#page-13 // see https://www.rfc-editor.org/rfc/rfc3986#page-13
var unreserved = new BitSet('~' + 1); var unreserved = new BitSet('z' + 1);
unreserved.set('-'); unreserved.set('-');
unreserved.set('.'); unreserved.set('.');
for (int c = '0'; c <= '9'; ++c) unreserved.set(c); for (int c = '0'; c <= '9'; ++c) unreserved.set(c);
for (int c = 'A'; c <= 'Z'; ++c) unreserved.set(c); for (int c = 'A'; c <= 'Z'; ++c) unreserved.set(c);
unreserved.set('_'); unreserved.set('_');
for (int c = 'a'; c <= 'z'; ++c) unreserved.set(c); for (int c = 'a'; c <= 'z'; ++c) unreserved.set(c);
unreserved.set('~');
UNRESERVED_URI_CHARS = unreserved; UNRESERVED_URI_CHARS = unreserved;
} }
@ -216,7 +225,7 @@ public final class UrlEncoder {
// see https://www.rfc-editor.org/rfc/rfc3986#page-13 // see https://www.rfc-editor.org/rfc/rfc3986#page-13
private static boolean isUnreservedUriChar(char ch) { private static boolean isUnreservedUriChar(char ch) {
return ch <= '~' && UNRESERVED_URI_CHARS.get(ch); return ch <= 'z' && UNRESERVED_URI_CHARS.get(ch);
} }
static class MainResult { static class MainResult {

View file

@ -16,7 +16,7 @@ import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.params.provider.Arguments.arguments; import static org.junit.jupiter.params.provider.Arguments.arguments;
class UrlEncoderTest { class UrlEncoderTest {
private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~"; private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.";
private static Stream<String> invalid() { private static Stream<String> invalid() {
return Stream.of("sdkjfh%", "sdkjfh%6", "sdkjfh%xx", "sdfjfh%-1"); return Stream.of("sdkjfh%", "sdkjfh%6", "sdkjfh%xx", "sdfjfh%-1");
@ -27,7 +27,7 @@ class UrlEncoderTest {
arguments("a test &", "a%20test%20%26"), arguments("a test &", "a%20test%20%26"),
arguments( arguments(
"!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=", "!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=",
"%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~%3D" "%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.%7E%3D"
), ),
arguments("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"), arguments("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"),
arguments( arguments(