Made the encoding even more defensive

This commit is contained in:
Erik C. Thauvin 2023-01-04 19:29:04 -08:00
parent e20c096cfe
commit a7b67c39af
6 changed files with 57 additions and 34 deletions

View file

@ -9,18 +9,27 @@
# URL Encoder for Kotlin # URL Encoder for Kotlin
A simple library to encode/decode URL parameters. A simple defensive library to encode/decode URL components.
This library was adapted from the [RIFE2 Web Application Framework](https://rife2.com). This library was adapted from the [RIFE2 Web Application Framework](https://rife2.com).
A pure Java version can also be found at [https://github.com/gbevin/urlencoder](https://github.com/gbevin/urlencoder). A pure Java version can also be found at [https://github.com/gbevin/urlencoder](https://github.com/gbevin/urlencoder).
The rules are determined by combining the unreserved character set from
[RFC 3986](https://www.rfc-editor.org/rfc/rfc3986#page-13) with the
percent-encode set from
[application/x-www-form-urlencoded](https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set).
For decades we've been using [java.net.URLEncoder](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/net/URLEncoder.html) because of its improper naming. It is actually intended to encode HTML form parameters, not URLs, causing the wrong escape sequences to be used. Both specs above support percent decoding of two hexadecimal digits to a
binary octet; however, their unreserved character sets differ, and
`application/x-www-form-urlencoded` adds conversion of space to `+`,
which has the potential to be misunderstood.
Additionally, `java.net.URLEncoder` allocates memory even when no encoding is necessary, significantly impacting performance. This library has a negligible performance impact when the specified string doesn't need to be encoded. This class encodes with rules that will be decoded correctly in either case.
Additionally, this library allocates no memory when encoding isn't needed and
Android's [Uri.encode](https://developer.android.com/reference/android/net/Uri#encode(java.lang.String,%20java.lang.String)) also addresses the same issues. does the work in a single pass without multiple loops. Both of these
optimizations have a significantly beneficial impact on performance of encoding
compared to other solutions like the standard `URLEncoder` in the JDK.
## Examples (TL;DR) ## Examples (TL;DR)
@ -34,6 +43,7 @@ UrlEncoder.decode("%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81") // -> %#
``` ```
## Gradle, Maven, etc. ## Gradle, Maven, etc.
To use with [Gradle](https://gradle.org/), include the following dependency in your build file: To use with [Gradle](https://gradle.org/), include the following dependency in your build file:
```gradle ```gradle
@ -47,13 +57,15 @@ dependencies {
} }
``` ```
Instructions for using with Maven, Ivy, etc. can be found on [Maven Central](https://maven-badges.herokuapp.com/maven-central/net.thauvin.erik/urlencoder). Instructions for using with Maven, Ivy, etc. can be found
on [Maven Central](https://maven-badges.herokuapp.com/maven-central/net.thauvin.erik/urlencoder).
## Standalone usage ## Standalone usage
UrlEncoder can be used on the command line also, both for encoding and decoding. UrlEncoder can be used on the command line also, both for encoding and decoding.
You have two options: You have two options:
* run it with Gradle * run it with Gradle
* build the jar and launch it with Java * build the jar and launch it with Java

View file

@ -21,7 +21,7 @@ plugins {
id("signing") id("signing")
} }
description = "A simple library to encode/decode URL parameters" description = "A simple defensive library to encode/decode URL components"
group = "net.thauvin.erik" group = "net.thauvin.erik"
version = "1.0.1-SNAPSHOT" version = "1.0.1-SNAPSHOT"
@ -193,7 +193,7 @@ publishing {
artifactId = rootProject.name artifactId = rootProject.name
artifact(javadocJar) artifact(javadocJar)
pom { pom {
name.set(mavenName) name.set("$mavenName for Kotlin")
description.set(project.description) description.set(project.description)
url.set(mavenUrl) url.set(mavenUrl)
licenses { licenses {

View file

@ -1,13 +1,15 @@
<?xml version='1.0' encoding='UTF-8'?> <?xml version='1.0' encoding='UTF-8'?>
<SmellBaseline> <SmellBaseline>
<ManuallySuppressedIssues/> <ManuallySuppressedIssues/>
<CurrentIssues> <CurrentIssues>
<ID>ComplexCondition:UrlEncoder.kt$UrlEncoder$hasOption &amp;&amp; args.size == 2 || !hasOption &amp;&amp; args.size == 1</ID> <ID>ComplexCondition:UrlEncoder.kt$UrlEncoder$hasOption &amp;&amp; args.size == 2 || !hasOption &amp;&amp;
<ID>MagicNumber:UrlEncoder.kt$UrlEncoder$0x80</ID> args.size == 1
<ID>MagicNumber:UrlEncoder.kt$UrlEncoder$0xFF</ID> </ID>
<ID>MagicNumber:UrlEncoder.kt$UrlEncoder$16</ID> <ID>MagicNumber:UrlEncoder.kt$UrlEncoder$0x80</ID>
<ID>MagicNumber:UrlEncoder.kt$UrlEncoder$3</ID> <ID>MagicNumber:UrlEncoder.kt$UrlEncoder$0xFF</ID>
<ID>MagicNumber:UrlEncoder.kt$UrlEncoder$4</ID> <ID>MagicNumber:UrlEncoder.kt$UrlEncoder$16</ID>
<ID>NestedBlockDepth:UrlEncoder.kt$UrlEncoder$@JvmStatic fun encode(source: String, allow: String): String</ID> <ID>MagicNumber:UrlEncoder.kt$UrlEncoder$3</ID>
</CurrentIssues> <ID>MagicNumber:UrlEncoder.kt$UrlEncoder$4</ID>
<ID>NestedBlockDepth:UrlEncoder.kt$UrlEncoder$@JvmStatic fun encode(source: String, allow: String): String</ID>
</CurrentIssues>
</SmellBaseline> </SmellBaseline>

View file

@ -9,8 +9,8 @@
<groupId>net.thauvin.erik</groupId> <groupId>net.thauvin.erik</groupId>
<artifactId>urlencoder</artifactId> <artifactId>urlencoder</artifactId>
<version>1.0.1-SNAPSHOT</version> <version>1.0.1-SNAPSHOT</version>
<name>UrlEncoder</name> <name>UrlEncoder for Kotlin</name>
<description>A simple library to encode/decode URL parameters</description> <description>A simple defensive library to encode/decode URL components</description>
<url>https://github.com/ethauvin/urlencoder</url> <url>https://github.com/ethauvin/urlencoder</url>
<licenses> <licenses>
<license> <license>

View file

@ -22,22 +22,31 @@ import java.util.BitSet
import kotlin.system.exitProcess import kotlin.system.exitProcess
/** /**
* URL parameters encoding and decoding. * Most defensive approach to URL encoding and decoding.
* *
* - Rules determined by [RFC 3986](https://www.rfc-editor.org/rfc/rfc3986#page-13), * - Rules determined by combining the unreserved character set from
* [RFC 3986](https://www.rfc-editor.org/rfc/rfc3986#page-13) with the percent-encode set from
* [application/x-www-form-urlencoded](https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set).
* *
* @author Geert Bevin (gbevin[remove] at uwyn dot com) * - Both specs above support percent decoding of two hexadecimal digits to a binary octet, however their unreserved
* set of characters differs and `application/x-www-form-urlencoded` adds conversion of space to `+`, which has the
* potential to be misunderstood.
*
* - This library encodes with rules that will be decoded correctly in either case.
*
* @author Geert Bevin (gbevin(remove) at uwyn dot com)
* @author Erik C. Thauvin (erik@thauvin.net) * @author Erik C. Thauvin (erik@thauvin.net)
*/ **/
object UrlEncoder { object UrlEncoder {
private val hexDigits = "0123456789ABCDEF".toCharArray() private val hexDigits = "0123456789ABCDEF".toCharArray()
internal val usage = internal val usage =
"Usage : java -jar urlencoder-*all.jar [-ed] text" + System.lineSeparator() + "Usage : java -jar urlencoder-*all.jar [-ed] text" + System.lineSeparator() +
"Encode and decode URL parameters." + System.lineSeparator() + " -e encode (default) " + "Encode and decode URL components defensively." + System.lineSeparator() + " -e encode (default) " +
System.lineSeparator() + " -d decode" System.lineSeparator() + " -d decode"
// see https://www.rfc-editor.org/rfc/rfc3986#page-13 // see https://www.rfc-editor.org/rfc/rfc3986#page-13
private val unreservedChars = BitSet('~'.code + 1).apply { // and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
private val unreservedChars = BitSet('z'.code + 1).apply {
set('-') set('-')
set('.') set('.')
for (c in '0'..'9') { for (c in '0'..'9') {
@ -50,14 +59,14 @@ object UrlEncoder {
for (c in 'a'.code..'z'.code) { for (c in 'a'.code..'z'.code) {
set(c) set(c)
} }
set('~')
} }
private fun BitSet.set(c: Char) = this.set(c.code) private fun BitSet.set(c: Char) = this.set(c.code)
// see https://www.rfc-editor.org/rfc/rfc3986#page-13 // see https://www.rfc-editor.org/rfc/rfc3986#page-13
// and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
private fun Char.isUnreserved(): Boolean { private fun Char.isUnreserved(): Boolean {
return this <= '~' && unreservedChars.get(code) return this <= 'z' && unreservedChars.get(code)
} }
private fun StringBuilder.appendEncodedDigit(digit: Int) { private fun StringBuilder.appendEncodedDigit(digit: Int) {
@ -130,7 +139,7 @@ object UrlEncoder {
* Transforms a provided [String] object into a new string, containing only valid URL characters in the UTF-8 * Transforms a provided [String] object into a new string, containing only valid URL characters in the UTF-8
* encoding. * encoding.
* *
* - Letters, numbers, unreserved (`_-!.~'()*`) and allowed characters are left intact. * - Letters, numbers, unreserved (`_-!.'()*`) and allowed characters are left intact.
*/ */
@JvmStatic @JvmStatic
fun encode(source: String, allow: String): String { fun encode(source: String, allow: String): String {
@ -177,7 +186,7 @@ object UrlEncoder {
* Transforms a provided [String] object into a new string, containing only valid URL characters in the UTF-8 * Transforms a provided [String] object into a new string, containing only valid URL characters in the UTF-8
* encoding. * encoding.
* *
* - Letters, numbers, unreserved (`_-!.~'()*`) and allowed characters are left intact. * - Letters, numbers, unreserved (`_-!.'()*`) and allowed characters are left intact.
*/ */
@JvmStatic @JvmStatic
fun encode(source: String, vararg allow: Char): String { fun encode(source: String, vararg allow: Char): String {
@ -187,7 +196,7 @@ object UrlEncoder {
/** /**
* Encodes and decodes URLs from the command line. * Encodes and decodes URLs from the command line.
* *
* - `kotlin -cp urlencoder-*.jar net.thauvin.erik.urlencoder.UrlEncoder` * - `java -jar urlencoder-*all.jar <text>`
*/ */
@JvmStatic @JvmStatic
fun main(args: Array<String>) { fun main(args: Array<String>) {
@ -200,7 +209,7 @@ object UrlEncoder {
} }
exitProcess(result.status) exitProcess(result.status)
} catch (e: IllegalArgumentException) { } catch (e: IllegalArgumentException) {
System.err.println("${UrlEncoder::class.java.simpleName}: ${e.message}"); System.err.println("${UrlEncoder::class.java.simpleName}: ${e.message}")
exitProcess(1) exitProcess(1)
} }
} }

View file

@ -34,7 +34,7 @@ import org.junit.jupiter.params.provider.ValueSource
import java.util.stream.Stream import java.util.stream.Stream
class UrlEncoderTest { class UrlEncoderTest {
private val same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~" private val same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_."
companion object { companion object {
@JvmStatic @JvmStatic
@ -45,7 +45,7 @@ class UrlEncoderTest {
arguments("a test &", "a%20test%20%26"), arguments("a test &", "a%20test%20%26"),
arguments( arguments(
"!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=", "!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=",
"%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~%3D" "%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.%7E%3D"
), ),
arguments("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"), arguments("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"),
arguments( arguments(