- * Rules determined by <a href="https://www.rfc-editor.org/rfc/rfc3986#page-13">RFC 3986</a>.
+ * Rules determined by combining the unreserved character set from
+ * <a href="https://www.rfc-editor.org/rfc/rfc3986#page-13">RFC 3986</a> with
+ * the percent-encode set from
+ * <a href="https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set">application/x-www-form-urlencoded</a>.
+ * <p>
+ * Both specs above support percent decoding of two hexadecimal digits to a
+ * binary octet, however their unreserved set of characters differs and
+ * {@code application/x-www-form-urlencoded} adds conversion of space to +,
+ * which has the potential to be misunderstood.
+ * <p>
+ * This class encodes with rules that will be decoded correctly in either case.
*
* @author Geert Bevin (gbevin[remove] at uwyn dot com)
* @author Erik C. Thauvin (erik@thauvin.net)
@@ -22,17 +34,21 @@ public final class UrlEncoder {
static {
// see https://www.rfc-editor.org/rfc/rfc3986#page-13
- var unreserved = new BitSet('~' + 1);
+ // and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
+ var unreserved = new BitSet('z' + 1);
unreserved.set('-');
unreserved.set('.');
for (int c = '0'; c <= '9'; ++c) unreserved.set(c);
for (int c = 'A'; c <= 'Z'; ++c) unreserved.set(c);
unreserved.set('_');
for (int c = 'a'; c <= 'z'; ++c) unreserved.set(c);
- unreserved.set('~');
UNRESERVED_URI_CHARS = unreserved;
}
+ private UrlEncoder() {
+ // no-op: private so that callers cannot instantiate this final class
+ }
+
private static void appendUrlEncodedByte(StringBuilder out, int ch) {
out.append("%");
appendUrlEncodedDigit(out, ch >> 4);
@@ -43,10 +59,6 @@ public final class UrlEncoder {
out.append(HEX_DIGITS[digit & 0x0F]);
}
- private UrlEncoder() {
- // no-op
- }
-
/**
* Transforms a provided <code>String</code> URL into a new string,
* containing decoded URL characters in the UTF-8 encoding.
@@ -57,7 +69,21 @@ public final class UrlEncoder {
* @since 1.0
*/
public static String decode(String source) {
- if (source == null || source.isBlank()) {
+ return decode(source, false);
+ }
+
+ /**
+ * Transforms a provided <code>String</code> URL into a new string,
+ * containing decoded URL characters in the UTF-8 encoding.
+ *
+ * @param source The string URL that has to be decoded
+ * @param plusToSpace Convert any {@code +} to space.
+ * @return The decoded <code>String</code> object.
+ * @see #encode(String, String)
+ * @since 1.0
+ */
+ public static String decode(String source, boolean plusToSpace) {
+ if (source == null || source.isEmpty()) {
return source;
}
@@ -67,14 +93,11 @@ public final class UrlEncoder {
byte[] bytes_buffer = null;
var bytes_pos = 0;
var i = 0;
- while(i < length) {
+ while (i < length) {
ch = source.charAt(i);
if (ch == '%') {
- if (out == null) {
- out = new StringBuilder(length);
- out.append(source, 0, i);
- }
+ out = startConstructingIfNeeded(out, source, i);
if (bytes_buffer == null) {
// the remaining characters divided by the length
@@ -97,7 +120,7 @@ public final class UrlEncoder {
i += 2;
} catch (NumberFormatException e) {
- throw new IllegalArgumentException("Illegal characters in escape sequence: " + e.getMessage());
+ throw new IllegalArgumentException("Illegal characters in escape sequence: " + e.getMessage(), e);
}
} else {
if (bytes_buffer != null) {
@@ -107,7 +130,10 @@ public final class UrlEncoder {
bytes_pos = 0;
}
- if (out != null) {
+ if (plusToSpace && ch == '+') {
+ out = startConstructingIfNeeded(out, source, i);
+ out.append(" ");
+ } else if (out != null) {
out.append(ch);
}
@@ -126,6 +152,14 @@ public final class UrlEncoder {
return out.toString();
}
+    /**
+     * Returns the builder used to assemble the converted string, creating it on first use.
+     * <p>
+     * When no builder exists yet, a new one is created with the capacity of {@code source}
+     * and seeded with the characters of {@code source} that precede
+     * {@code currentSourcePosition}. An existing builder is returned untouched.
+     *
+     * @param out                   the builder in use, or {@code null} when none was created yet
+     * @param source                the string being converted
+     * @param currentSourcePosition index in {@code source} of the character being processed
+     * @return {@code out} when it is not {@code null}, otherwise the newly seeded builder
+     */
+    private static StringBuilder startConstructingIfNeeded(StringBuilder out, String source, int currentSourcePosition) {
+        if (out != null) {
+            return out;
+        }
+        var seeded = new StringBuilder(source.length());
+        seeded.append(source, 0, currentSourcePosition);
+        return seeded;
+    }
+
/**
* Transforms a provided <code>String</code> object into a new string,
* containing only valid URL characters in the UTF-8 encoding.
@@ -137,22 +171,7 @@ public final class UrlEncoder {
* @since 1.0
*/
public static String encode(String source) {
- return encode(source, (String)null);
- }
-
- /**
- * Transforms a provided <code>String</code> object into a new string,
- * containing only valid URL characters in the UTF-8 encoding.
- *
- * @param source The string that has to be transformed into a valid URL
- * string.
- * @param allow Additional characters to allow.
- * @return The encoded <code>String</code> object.
- * @see #decode(String)
- * @since 1.0
- */
- public static String encode(String source, char... allow) {
- return encode(source, new String(allow));
+ return encode(source, null, false);
}
/**
@@ -167,14 +186,45 @@ public final class UrlEncoder {
* @since 1.0
*/
public static String encode(String source, String allow) {
- if (source == null || source.isBlank()) {
+ return encode(source, allow, false);
+ }
+
+    /**
+     * Transforms a provided <code>String</code> object into a new string,
+     * containing only valid URL characters in the UTF-8 encoding.
+     * <p>
+     * Delegates to {@link #encode(String, String, boolean)} without any
+     * additional allowed characters.
+     *
+     * @param source      The string that has to be transformed into a valid URL
+     *                    string.
+     * @param spaceToPlus Convert any space to {@code +}.
+     * @return The encoded <code>String</code> object.
+     * @see #decode(String)
+     * @since 1.0
+     */
+    public static String encode(String source, boolean spaceToPlus) {
+        final String noAdditionalAllowed = null;
+        return encode(source, noAdditionalAllowed, spaceToPlus);
+    }
+
+ /**
+ * Transforms a provided <code>String</code> object into a new string,
+ * containing only valid URL characters in the UTF-8 encoding.
+ *
+ * @param source The string that has to be transformed into a valid URL
+ * string.
+ * @param allow Additional characters to allow.
+ * @param spaceToPlus Convert any space to {@code +}.
+ * @return The encoded <code>String</code> object.
+ * @see #decode(String)
+ * @since 1.0
+ */
+ public static String encode(String source, String allow, boolean spaceToPlus) {
+ if (source == null || source.isEmpty()) {
return source;
}
StringBuilder out = null;
char ch;
var i = 0;
- while(i < source.length()) {
+ while (i < source.length()) {
ch = source.charAt(i);
if (isUnreservedUriChar(ch) || (allow != null && allow.indexOf(ch) != -1)) {
if (out != null) {
@@ -182,14 +232,15 @@ public final class UrlEncoder {
}
i += 1;
} else {
- if (out == null) {
- out = new StringBuilder(source.length());
- out.append(source, 0, i);
- }
+ out = startConstructingIfNeeded(out, source, i);
var cp = source.codePointAt(i);
if (cp < 0x80) {
- appendUrlEncodedByte(out, cp);
+ if (spaceToPlus && ch == ' ') {
+ out.append('+');
+ } else {
+ appendUrlEncodedByte(out, cp);
+ }
i += 1;
} else if (Character.isBmpCodePoint(cp)) {
for (var b : Character.toString(ch).getBytes(StandardCharsets.UTF_8)) {
@@ -215,7 +266,74 @@ public final class UrlEncoder {
}
// see https://www.rfc-editor.org/rfc/rfc3986#page-13
+ // and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
private static boolean isUnreservedUriChar(char ch) {
- return ch <= '~' && UNRESERVED_URI_CHARS.get(ch);
+ return ch <= 'z' && UNRESERVED_URI_CHARS.get(ch);
+ }
+
+    /**
+     * Main method to encode/decode URLs on the command line.
+     * <p>
+     * The output produced by {@code processMain} is printed to standard output
+     * when its status is {@code 0} and to standard error otherwise, after which
+     * the JVM exits with that status. An {@link IllegalArgumentException} is
+     * reported on standard error, prefixed with the simple class name, and the
+     * JVM exits with status {@code 1}.
+     *
+     * @param arguments the command line arguments
+     * @since 1.1
+     */
+    public static void main(String[] arguments) {
+        try {
+            var outcome = processMain(arguments);
+            // successful results go to stdout, the usage message goes to stderr
+            var stream = outcome.status == 0 ? System.out : System.err;
+            stream.println(outcome.output);
+            System.exit(outcome.status);
+        } catch (IllegalArgumentException e) {
+            System.err.println(UrlEncoder.class.getSimpleName() + ": " + e.getMessage());
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Interprets the command line arguments and performs the requested conversion.
+     * <p>
+     * A first argument starting with {@code -} is consumed as an option:
+     * {@code -d} selects decoding, {@code -e} selects encoding, and any other
+     * option invalidates the invocation. Exactly one text argument has to remain
+     * after the option; when no option is given that text must also be non-empty.
+     *
+     * @param arguments the command line arguments
+     * @return the encoded or decoded text with status {@code 0}, or the usage
+     * message with status {@code 1} when the arguments are not valid
+     */
+    static MainResult processMain(String... arguments) {
+        var remaining = new ArrayList<>(List.of(arguments));
+        var decoding = false;
+        var accepted = false;
+
+        if (!remaining.isEmpty() && remaining.get(0).startsWith("-")) {
+            switch (remaining.remove(0)) {
+                case "-d":
+                    decoding = true;
+                    accepted = remaining.size() == 1;
+                    break;
+                case "-e":
+                    accepted = remaining.size() == 1;
+                    break;
+                default:
+                    // unknown option: drop everything so the usage message is returned
+                    remaining.clear();
+                    break;
+            }
+        }
+
+        var text = "";
+        if (remaining.size() == 1 && !remaining.get(0).isEmpty()) {
+            text = remaining.get(0);
+            accepted = true;
+        }
+
+        if (!accepted) {
+            var usage = String.join(System.lineSeparator(),
+                    "Usage : java -jar urlencoder-*.jar [-ed] text",
+                    "Encode and decode URL components defensively.",
+                    " -e encode (default)",
+                    " -d decode");
+            return new MainResult(usage, 1);
+        }
+
+        var converted = decoding ? UrlEncoder.decode(text) : UrlEncoder.encode(text);
+        return new MainResult(converted, 0);
+    }
+
+    /**
+     * Immutable pair describing the outcome of a command line invocation:
+     * the text to print and the exit status to report.
+     */
+    static class MainResult {
+        /** Exit status; {@code main} treats {@code 0} as success. */
+        final int status;
+        /** Text that {@code main} prints for this result. */
+        final String output;
+
+        /**
+         * @param output the text to print
+         * @param status the exit status to report
+         */
+        public MainResult(String output, int status) {
+            this.status = status;
+            this.output = output;
+        }
+    }
}
diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
new file mode 100644
index 0000000..a0a140e
--- /dev/null
+++ b/src/main/java/module-info.java
@@ -0,0 +1,3 @@
+module com.uwyn.urlencoder {
+ exports com.uwyn.urlencoder;
+}
diff --git a/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java b/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java
new file mode 100644
index 0000000..be7408d
--- /dev/null
+++ b/src/test/java/com/uwyn/urlencoder/UrlEncoderTest.java
@@ -0,0 +1,169 @@
+/*
+ * Copyright 2001-2023 Geert Bevin (gbevin[remove] at uwyn dot com)
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+package com.uwyn.urlencoder;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.params.provider.Arguments.arguments;
+
+class UrlEncoderTest {
+    // Characters expected to pass through unchanged: ASCII letters, digits, '-', '_' and '.'.
+    // The upper-case run previously read "UVQXYZ", repeating 'Q' and leaving 'W' untested.
+    private final String same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.";
+
+ private static Stream