Made the encoding even more defensive.

This commit is contained in:
Geert Bevin 2023-01-04 21:08:31 -05:00
parent 320be3b3a1
commit b12f3eafd7
3 changed files with 32 additions and 17 deletions

View file

@ -8,9 +8,19 @@ import java.nio.charset.StandardCharsets;
import java.util.*;
/**
* URL encoding and decoding.
* Most defensive approach to URL encoding and decoding.
* <p>
* Rules determined by <a href="https://www.rfc-editor.org/rfc/rfc3986#page-13">RFC 3986</a>.
* Rules determined by combining the unreserved character set from
* <a href="https://www.rfc-editor.org/rfc/rfc3986#page-13">RFC 3986</a> with
* the percent-encode set from
* <a href="https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set">application/x-www-form-urlencoded</a>.
* <p>
* Both specs above support percent decoding of two hexadecimal digits to a
* binary octet; however, their sets of unreserved characters differ, and
* {@code application/x-www-form-urlencoded} additionally converts a space to
* {@code +}, which has the potential to be misunderstood.
* <p>
* This class encodes using rules whose output is decoded correctly under either spec.
*
* @author Geert Bevin (gbevin[remove] at uwyn dot com)
* @author Erik C. Thauvin (erik@thauvin.net)
@ -22,14 +32,13 @@ public final class UrlEncoder {
static {
// see https://www.rfc-editor.org/rfc/rfc3986#page-13
var unreserved = new BitSet('~' + 1);
var unreserved = new BitSet('z' + 1);
unreserved.set('-');
unreserved.set('.');
for (int c = '0'; c <= '9'; ++c) unreserved.set(c);
for (int c = 'A'; c <= 'Z'; ++c) unreserved.set(c);
unreserved.set('_');
for (int c = 'a'; c <= 'z'; ++c) unreserved.set(c);
unreserved.set('~');
UNRESERVED_URI_CHARS = unreserved;
}
@ -216,7 +225,7 @@ public final class UrlEncoder {
// see https://www.rfc-editor.org/rfc/rfc3986#page-13
private static boolean isUnreservedUriChar(char ch) {
return ch <= '~' && UNRESERVED_URI_CHARS.get(ch);
return ch <= 'z' && UNRESERVED_URI_CHARS.get(ch);
}
static class MainResult {

View file

@ -16,7 +16,7 @@ import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.params.provider.Arguments.arguments;
class UrlEncoderTest {
private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~";
private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.";
private static Stream<String> invalid() {
return Stream.of("sdkjfh%", "sdkjfh%6", "sdkjfh%xx", "sdfjfh%-1");
@ -27,7 +27,7 @@ class UrlEncoderTest {
arguments("a test &", "a%20test%20%26"),
arguments(
"!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=",
"%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~%3D"
"%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.%7E%3D"
),
arguments("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"),
arguments(