Convert urlencoder lib to Kotlin Multiplatform (#10)

* convert UrlEncoderUtil to be multiplatform compatible
* convert lib tests to KMP
* convert UrlEncoderTest to commonTest (since there's only a JVM target, there are no changes), and also replace mutable test data with read-only types
* Update copyright

---------

Co-authored-by: Erik C. Thauvin <erik@thauvin.net>
2023-09-05 22:55:29 +02:00 · 2023-09-05 22:55:29 +02:00 · 8fcd629bce
commit 8fcd629bce
parent ae060f5bd2
11 changed files with 196 additions and 206 deletions
--- a/urlencoder-lib/src/commonMain/kotlin/net/thauvin/erik/urlencoder/Character.kt
+++ b/urlencoder-lib/src/commonMain/kotlin/net/thauvin/erik/urlencoder/Character.kt
@ -0,0 +1,73 @@
+/*
+ * Copyright 2001-2023 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package net.thauvin.erik.urlencoder
+
+import kotlin.Char.Companion.MIN_HIGH_SURROGATE
+import kotlin.Char.Companion.MIN_LOW_SURROGATE
+
+/**
+ * Kotlin Multiplatform equivalent for `java.lang.Character`
+ *
+ * @author <a href="https://github.com/aSemy">aSemy</a>
+ */
+
+internal object Character {
+
+    /**
+     * See https://www.tutorialspoint.com/java/lang/character_issupplementarycodepoint.htm
+     *
+     * Determines whether the specified character (Unicode code point) is in the supplementary character range.
+     * The supplementary character range in the Unicode system is `U+10000` to `U+10FFFF`.
+     *
+     * Unicode code points are divided into two categories:
+     * Basic Multilingual Plane (BMP) code points and supplementary code points.
+     * BMP code points lie in the range `U+0000` to `U+FFFF`.
+     *
+     * Supplementary characters, by contrast, are rare characters that are not represented using the original
+     * 16-bit Unicode. For example, these types of characters are used in Chinese or Japanese scripts and hence are
+     * required by the applications used in these countries.
+     *
+     * @param codePoint the Unicode code point to test
+     * @return `true` if the specified code point falls in the range of supplementary code points
+     * ([MIN_SUPPLEMENTARY_CODE_POINT] to [MAX_CODE_POINT], inclusive), `false` otherwise.
+     */
+    internal fun isSupplementaryCodePoint(codePoint: Int): Boolean =
+        codePoint in MIN_SUPPLEMENTARY_CODE_POINT..MAX_CODE_POINT
+
+    internal fun toCodePoint(highSurrogate: Char, lowSurrogate: Char): Int = // surrogate pair -> one code point
+        (highSurrogate.code shl 10) + lowSurrogate.code + SURROGATE_DECODE_OFFSET
+
+    /** Returns `true` if [codePoint] has no bits set above the low 16, i.e. lies in the Basic Multilingual Plane (BMP). */
+    internal fun isBmpCodePoint(codePoint: Int): Boolean = codePoint ushr 16 == 0
+
+    internal fun highSurrogateOf(codePoint: Int): Char = // leading surrogate; codePoint is not validated here
+        ((codePoint ushr 10) + HIGH_SURROGATE_ENCODE_OFFSET.code).toChar()
+
+    internal fun lowSurrogateOf(codePoint: Int): Char = // trailing surrogate: low 10 bits + MIN_LOW_SURROGATE
+        ((codePoint and 0x3FF) + MIN_LOW_SURROGATE.code).toChar()
+
+//    private const val MIN_CODE_POINT: Int = 0x000000
+    private const val MAX_CODE_POINT: Int = 0x10FFFF // upper bound used by isSupplementaryCodePoint
+
+    private const val MIN_SUPPLEMENTARY_CODE_POINT: Int = 0x10000 // first code point above the BMP
+
+    private const val SURROGATE_DECODE_OFFSET: Int = // added in toCodePoint to cancel both surrogate bases
+        MIN_SUPPLEMENTARY_CODE_POINT -
+          (MIN_HIGH_SURROGATE.code shl 10) -
+          MIN_LOW_SURROGATE.code
+
+    private const val HIGH_SURROGATE_ENCODE_OFFSET: Char = MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT ushr 10)
+}
--- a/urlencoder-lib/src/commonMain/kotlin/net/thauvin/erik/urlencoder/UrlEncoderUtil.kt
+++ b/urlencoder-lib/src/commonMain/kotlin/net/thauvin/erik/urlencoder/UrlEncoderUtil.kt
@ -1,12 +1,11 @@
 /*
- * Copyright 2001-2023 Geert Bevin (gbevin[remove] at uwyn dot com)
- * Copyright 2022-2023 Erik C. Thauvin (erik@thauvin.net)
+ * Copyright 2001-2023 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
- *    http://www.apache.org/licenses/LICENSE-2.0
+ *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
@ -17,8 +16,8 @@

 package net.thauvin.erik.urlencoder

-import java.nio.charset.StandardCharsets
-import java.util.BitSet
+import kotlin.jvm.JvmOverloads
+import kotlin.jvm.JvmStatic

 /**
 * Most defensive approach to URL encoding and decoding.
@ -39,20 +38,27 @@ import java.util.BitSet
 object UrlEncoderUtil {
    private val hexDigits = "0123456789ABCDEF".toCharArray()

-    // see https://www.rfc-editor.org/rfc/rfc3986#page-13
-    // and https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
-    private val unreservedChars = BitSet('z'.code + 1).apply {
-        set('-'.code)
-        set('.'.code)
-        for (c in '0'.code..'9'.code) {
-            set(c)
+    /**
+     * A [BooleanArray] with entries for the [character codes][Char.code] of
+     *
+     * * `-`, `.`, `_`,
+     * * `0-9`,
+     * * `A-Z`,
+     * * `a-z`
+     *
+     * set to `true`.
+     */
+    private val unreservedChars = BooleanArray('z'.code + 1).apply {
+        set('-'.code, true)
+        set('.'.code, true)
+        set('_'.code, true)
+        for (c in '0'..'9') {
+            set(c.code, true)
        }
-        for (c in 'A'.code..'Z'.code) {
-            set(c)
+        for (c in 'A'..'Z') {
+            set(c.code, true)
        }
-        set('_'.code)
-        for (c in 'a'.code..'z'.code) {
-            set(c)
+        for (c in 'a'..'z') {
+            set(c.code, true)
        }
    }

@ -84,14 +90,13 @@ object UrlEncoderUtil {
        }

        val length = source.length
-        val out: StringBuilder by lazy { StringBuilder(length) }
-        var ch: Char
+        val out = StringBuilder(length)
        var bytesBuffer: ByteArray? = null
        var bytesPos = 0
        var i = 0
        var started = false
        while (i < length) {
-            ch = source[i]
+            val ch = source[i]
            if (ch == '%') {
                if (!started) {
                    out.append(source, 0, i)
@ -103,7 +108,7 @@ object UrlEncoderUtil {
                    bytesBuffer = ByteArray((length - i) / 3)
                }
                i++
-                require(length >= i + 2) { "Illegal escape sequence" }
+                require(length >= i + 2) { "Incomplete trailing escape ($ch) pattern" }
                try {
                    val v = source.substring(i, i + 2).toInt(16)
                    require(v in 0..0xFF) { "Illegal escape value" }
@ -114,7 +119,7 @@ object UrlEncoderUtil {
                }
            } else {
                if (bytesBuffer != null) {
-                    out.append(String(bytesBuffer, 0, bytesPos, StandardCharsets.UTF_8))
+                    out.append(bytesBuffer.decodeToString(0, bytesPos))
                    started = true
                    bytesBuffer = null
                    bytesPos = 0
@ -133,15 +138,15 @@ object UrlEncoderUtil {
        }

        if (bytesBuffer != null) {
-            out.append(String(bytesBuffer, 0, bytesPos, StandardCharsets.UTF_8))
+            out.append(bytesBuffer.decodeToString(0, bytesPos))
        }

        return if (!started) source else out.toString()
    }

    /**
-     * Transforms a provided [String] object into a new string, containing only valid URL characters in the UTF-8
-     * encoding.
+     * Transforms a provided [String] object into a new string, containing only valid URL
+     * characters in the UTF-8 encoding.
     *
     * - Letters, numbers, unreserved (`-._`) and allowed characters are left intact.
     */
@ -152,11 +157,10 @@ object UrlEncoderUtil {
            return source
        }
        var out: StringBuilder? = null
-        var ch: Char
        var i = 0
        while (i < source.length) {
-            ch = source[i]
-            if (ch.isUnreserved() || allow.indexOf(ch) != -1) {
+            val ch = source[i]
+            if (ch.isUnreserved() || ch in allow) {
                out?.append(ch)
                i++
            } else {
@ -174,16 +178,18 @@ object UrlEncoderUtil {
                        }
                        i++
                    }
+
                    Character.isBmpCodePoint(cp) -> {
-                        for (b in ch.toString().toByteArray(StandardCharsets.UTF_8)) {
+                        for (b in ch.toString().encodeToByteArray()) {
                            out.appendEncodedByte(b.toInt())
                        }
                        i++
                    }
+
                    Character.isSupplementaryCodePoint(cp) -> {
-                        val high = Character.highSurrogate(cp)
-                        val low = Character.lowSurrogate(cp)
-                        for (b in charArrayOf(high, low).concatToString().toByteArray(StandardCharsets.UTF_8)) {
+                        val high = Character.highSurrogateOf(cp)
+                        val low = Character.lowSurrogateOf(cp)
+                        for (b in charArrayOf(high, low).concatToString().encodeToByteArray()) {
                            out.appendEncodedByte(b.toInt())
                        }
                        i += 2
@ -194,4 +200,48 @@ object UrlEncoderUtil {

        return out?.toString() ?: source
    }
+
+    /**
+     * Returns the Unicode code point at the specified index.
+     *
+     * The `index` parameter is the regular `CharSequence` index, i.e. the number of `Char`s from the start of the character
+     * sequence.
+     *
+     * If the code point at the specified index is part of the Basic Multilingual Plane (BMP), its value can be represented
+     * using a single `Char` and this method will behave exactly like [CharSequence.get].
+     * Code points outside the BMP are encoded using a surrogate pair – a `Char` containing a value in the high surrogate
+     * range followed by a `Char` containing a value in the low surrogate range. Together these two `Char`s encode a single
+     * code point in one of the supplementary planes. This method will do the necessary decoding and return the value of
+     * that single code point.
+     *
+     * In situations where surrogate characters are encountered that don't form a valid surrogate pair starting at `index`,
+     * this method will return the surrogate code point itself, behaving like [CharSequence.get].
+     *
+     * If the `index` is out of bounds of this character sequence, this method throws an [IndexOutOfBoundsException].
+     *
+     * ```kotlin
+     * // Text containing code points outside the BMP (encoded as surrogate pairs)
+     * val text = "\uD83E\uDD95\uD83E\uDD96"
+     *
+     * var index = 0
+     * while (index < text.length) {
+     *     val codePoint = text.codePointAt(index)
+     *     // (Do something with codePoint...)
+     *     // A supplementary code point occupies two `Char`s, any other code point one.
+     *     index += if (Character.isSupplementaryCodePoint(codePoint)) 2 else 1
+     * }
+     * ```
+     *
+     * @param index position of the first `Char` of the code point within this character sequence
+     * @return the decoded code point when a high surrogate at `index` is followed by a low surrogate,
+     * otherwise the code of the `Char` at `index`
+     * @throws IndexOutOfBoundsException if `index` is not in [indices]
+     */
+    private fun CharSequence.codePointAt(index: Int): Int {
+        if (index !in indices) throw IndexOutOfBoundsException("index $index was not in range $indices")
+
+        val firstChar = this[index]
+        if (firstChar.isHighSurrogate()) {
+            val nextChar = getOrNull(index + 1) // null when the high surrogate is the last Char
+            if (nextChar?.isLowSurrogate() == true) {
+                return Character.toCodePoint(firstChar, nextChar)
+            }
+        }
+
+        return firstChar.code // BMP char or unpaired surrogate: returned as-is
+    }
 }
--- a/urlencoder-lib/src/commonTest/kotlin/net/thauvin/erik/urlencoder/UrlEncoderUtilTest.kt
+++ b/urlencoder-lib/src/commonTest/kotlin/net/thauvin/erik/urlencoder/UrlEncoderUtilTest.kt
@ -1,12 +1,11 @@
 /*
- * Copyright 2001-2023 Geert Bevin (gbevin[remove] at uwyn dot com)
- * Copyright 2022-2023 Erik C. Thauvin (erik@thauvin.net)
+ * Copyright 2001-2023 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
- *    http://www.apache.org/licenses/LICENSE-2.0
+ *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
@ -27,32 +26,26 @@ class UrlEncoderUtilTest {
    private val same = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_."

    companion object {
-        @JvmStatic
-        var invalid = arrayOf("sdkjfh%", "sdkjfh%6", "sdkjfh%xx", "sdfjfh%-1")
+        val invalid = listOf("sdkjfh%", "sdkjfh%6", "sdkjfh%xx", "sdfjfh%-1")

-        @JvmStatic
-        var validMap = arrayOf(
-            Pair("a test &", "a%20test%20%26"),
-            Pair(
-                "!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=",
-                "%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.%7E%3D"
-            ),
-            Pair("%#okékÉȢ smile!😁", "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81"),
-            Pair(
-                "\uD808\uDC00\uD809\uDD00\uD808\uDF00\uD808\uDD00", "%F0%92%80%80%F0%92%94%80%F0%92%8C%80%F0%92%84%80"
-            )
+        val validMap = listOf(
+            "a test &" to "a%20test%20%26",
+            "!abcdefghijklmnopqrstuvwxyz%%ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.~=" to
+              "%21abcdefghijklmnopqrstuvwxyz%25%25ABCDEFGHIJKLMNOPQRSTUVQXYZ0123456789-_.%7E%3D",
+            "%#okékÉȢ smile!😁" to "%25%23ok%C3%A9k%C3%89%C8%A2%20smile%21%F0%9F%98%81",
+            "\uD808\uDC00\uD809\uDD00\uD808\uDF00\uD808\uDD00" to "%F0%92%80%80%F0%92%94%80%F0%92%8C%80%F0%92%84%80",
        )
    }

    @Test
-    fun `Decode URL`() {
+    fun decodeURL() {
        for (m in validMap) {
            assertEquals(m.first, decode(m.second))
        }
    }

    @Test
-    fun `Decode with Exception`() {
+    fun decodeWithException() {
        for (source in invalid) {
            assertFailsWith<IllegalArgumentException>(
                message = "decode($source)",
@ -62,14 +55,14 @@ class UrlEncoderUtilTest {
    }

    @Test
-    fun `Decode when None needed`() {
+    fun decodeWhenNoneNeeded() {
        assertSame(same, decode(same))
        assertEquals("decode('')", decode(""), "")
        assertEquals("decode(' ')", decode(" "), " ")
    }

    @Test
-    fun `Decode with Plus to Space`() {
+    fun decodeWithPlusToSpace() {
        assertEquals("foo bar", decode("foo+bar", true))
        assertEquals("foo bar  foo", decode("foo+bar++foo", true))
        assertEquals("foo  bar  foo", decode("foo+%20bar%20+foo", true))
@ -78,34 +71,34 @@ class UrlEncoderUtilTest {
    }

    @Test
-    fun `Encode URL`() {
+    fun encodeURL() {
        for (m in validMap) {
            assertEquals(m.second, encode(m.first))
        }
    }

    @Test
-    fun `Encode Empty or Blank`() {
+    fun encodeEmptyOrBlank() {
        assertTrue(encode("", allow = "").isEmpty(), "encode('','')")
        assertEquals("encode('')", encode(""), "")
        assertEquals("encode(' ')", encode(" "), "%20")
    }

    @Test
-    fun `Encode when None needed`() {
+    fun encodeWhenNoneNeeded() {
        assertSame(encode(same), same)
        assertSame("with empty allow", encode(same, allow = ""), same)
    }

    @Test
-    fun `Encode with Allow`() {
-        assertEquals("encode(x, =?)","?test=a%20test", encode("?test=a test", allow = "=?"))
+    fun encodeWithAllow() {
+        assertEquals("encode(x, =?)", "?test=a%20test", encode("?test=a test", allow = "=?"))
        assertEquals("encode(aaa, a)", "aaa", encode("aaa", "a"))
-        assertEquals("encode(' ')", " ", encode(" ", " ") )
+        assertEquals("encode(' ')", " ", encode(" ", " "))
    }

    @Test
-    fun `Encode with Space to Plus`() {
+    fun encodeWithSpaceToPlus() {
        assertEquals("foo+bar", encode("foo bar", spaceToPlus = true))
        assertEquals("foo+bar++foo", encode("foo bar  foo", spaceToPlus = true))
        assertEquals("foo bar", encode("foo bar", " ", true))