Made the encoding even more defensive

This commit is contained in:
Erik C. Thauvin 2023-01-04 19:29:04 -08:00
parent e20c096cfe
commit a7b67c39af
6 changed files with 57 additions and 34 deletions

View file

@ -22,22 +22,31 @@ import java.util.BitSet
import kotlin.system.exitProcess
/**
* URL parameters encoding and decoding.
* Most defensive approach to URL encoding and decoding.
*
* - Rules determined by [RFC 3986](https://www.rfc-editor.org/rfc/rfc3986#page-13),
* - Rules determined by combining the unreserved character set from
* [RFC 3986](https://www.rfc-editor.org/rfc/rfc3986#page-13) with the percent-encode set from
* [application/x-www-form-urlencoded](https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set).
*
* @author Geert Bevin (gbevin[remove] at uwyn dot com)
* - Both specs above support percent decoding of two hexadecimal digits to a binary octet, however their unreserved
* set of characters differs and `application/x-www-form-urlencoded` adds conversion of space to `+`, which has the
* potential to be misunderstood.
*
* - This library encodes with rules that will be decoded correctly in either case.
*
* @author Geert Bevin (gbevin(remove) at uwyn dot com)
* @author Erik C. Thauvin (erik@thauvin.net)
*/
**/
object UrlEncoder {
private val hexDigits = "0123456789ABCDEF".toCharArray()
internal val usage =
"Usage : java -jar urlencoder-*all.jar [-ed] text" + System.lineSeparator() +
"Encode and decode URL parameters." + System.lineSeparator() + " -e encode (default) " +
"Encode and decode URL components defensively." + System.lineSeparator() + " -e encode (default) " +
System.lineSeparator() + " -d decode"
// see https://www.rfc-editor.org/rfc/rfc3986#page-13
private val unreservedChars = BitSet('~'.code + 1).apply {
// and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
private val unreservedChars = BitSet('z'.code + 1).apply {
set('-')
set('.')
for (c in '0'..'9') {
@ -50,14 +59,14 @@ object UrlEncoder {
for (c in 'a'.code..'z'.code) {
set(c)
}
set('~')
}
private fun BitSet.set(c: Char) = this.set(c.code)
// see https://www.rfc-editor.org/rfc/rfc3986#page-13
// and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
private fun Char.isUnreserved(): Boolean {
return this <= '~' && unreservedChars.get(code)
return this <= 'z' && unreservedChars.get(code)
}
private fun StringBuilder.appendEncodedDigit(digit: Int) {
@ -130,7 +139,7 @@ object UrlEncoder {
* Transforms a provided [String] object into a new string, containing only valid URL characters in the UTF-8
* encoding.
*
* - Letters, numbers, unreserved (`_-!.~'()*`) and allowed characters are left intact.
* - Letters, numbers, unreserved (`_-!.'()*`) and allowed characters are left intact.
*/
@JvmStatic
fun encode(source: String, allow: String): String {
@ -177,7 +186,7 @@ object UrlEncoder {
* Transforms a provided [String] object into a new string, containing only valid URL characters in the UTF-8
* encoding.
*
* - Letters, numbers, unreserved (`_-!.~'()*`) and allowed characters are left intact.
* - Letters, numbers, unreserved (`_-!.'()*`) and allowed characters are left intact.
*/
@JvmStatic
fun encode(source: String, vararg allow: Char): String {
@ -187,7 +196,7 @@ object UrlEncoder {
/**
* Encodes and decodes URLs from the command line.
*
* - `kotlin -cp urlencoder-*.jar net.thauvin.erik.urlencoder.UrlEncoder`
* - `java -jar urlencoder-*all.jar <text>`
*/
@JvmStatic
fun main(args: Array<String>) {
@ -200,7 +209,7 @@ object UrlEncoder {
}
exitProcess(result.status)
} catch (e: IllegalArgumentException) {
System.err.println("${UrlEncoder::class.java.simpleName}: ${e.message}");
System.err.println("${UrlEncoder::class.java.simpleName}: ${e.message}")
exitProcess(1)
}
}

View file

@ -34,7 +34,7 @@ import org.junit.jupiter.params.provider.ValueSource
import java.util.stream.Stream
class UrlEncoderTest {
private val same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~"
private val same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_."
companion object {
@JvmStatic
@ -45,7 +45,7 @@ class UrlEncoderTest {
arguments("a test &", "a%20test%20%26"),
arguments(
"!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=",
"%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~%3D"
"%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.%7E%3D"
),
arguments("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"),
arguments(