From a2d3b59394d62074debc4903e46c3aa97179172e Mon Sep 17 00:00:00 2001 From: "John J. Aylward" Date: Thu, 22 Sep 2016 12:38:06 -0400 Subject: [PATCH 1/9] Implements unicode escaping similar to JSONObject. * Removes deprecation on XML.stringToValue(). It now provides unescaping for strings to convert XML entities back into values. * New unescape function to handle XML entities -> value conversion. --- JSONML.java | 4 +-- XML.java | 80 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 73 insertions(+), 11 deletions(-) diff --git a/JSONML.java b/JSONML.java index 7e99aab..9f861eb 100644 --- a/JSONML.java +++ b/JSONML.java @@ -175,7 +175,7 @@ public class JSONML { if (!(token instanceof String)) { throw x.syntaxError("Missing value"); } - newjo.accumulate(attribute, keepStrings ? token :JSONObject.stringToValue((String)token)); + newjo.accumulate(attribute, keepStrings ? token :XML.stringToValue((String)token)); token = null; } else { newjo.accumulate(attribute, ""); @@ -226,7 +226,7 @@ public class JSONML { } else { if (ja != null) { ja.put(token instanceof String - ? keepStrings ? token :JSONObject.stringToValue((String)token) + ? keepStrings ? token :XML.stringToValue((String)token) : token); } } diff --git a/XML.java b/XML.java index 0f0bee9..833488a 100644 --- a/XML.java +++ b/XML.java @@ -35,7 +35,6 @@ import java.util.Iterator; */ @SuppressWarnings("boxing") public class XML { - /** The Character '&'. 
*/ public static final Character AMP = '&'; @@ -71,6 +70,7 @@ public class XML { * < (less than) is replaced by &lt; * > (greater than) is replaced by &gt; * " (double quote) is replaced by &quot; + * ' (single quote / apostrophe) is replaced by &apos; * * * @param string @@ -98,6 +98,67 @@ public class XML { sb.append("&apos;"); break; default: + if (c < ' ' || (c >= '\u0080' && c < '\u00a0') || (c >= '\u2000' && c < '\u2100')) { + sb.append("&#x"); + sb.append(Integer.toHexString(c)); + sb.append(";"); + } else { + sb.append(c); + } + } + } + return sb.toString(); + } + + /** + * Removes XML escapes from the string. + * + * @param string + * string to remove escapes from + * @return string with converted entities + */ + public static String unescape(String string) { + StringBuilder sb = new StringBuilder(string.length()); + for (int i = 0, length = string.length(); i < length; i++) { + char c = string.charAt(i); + if (c == AMP) { + final int semic = string.indexOf(';', i); + if (semic > i) { + final String entity = string.substring(i + 1, semic); + if (entity.charAt(0) == '#') { + char cc; + if (entity.charAt(1) == 'x') { + // hex encoded unicode + cc = (char) Integer.parseInt(entity.substring(2), 16); + } else { + // decimal encoded unicode + cc = (char) Integer.parseInt(entity.substring(1)); + } + sb.append(cc); + } else { + if ("quot".equalsIgnoreCase(entity)) { + sb.append('"'); + } else if ("amp".equalsIgnoreCase(entity)) { + sb.append(AMP); + } else if ("apos".equalsIgnoreCase(entity)) { + sb.append('\''); + } else if ("lt".equalsIgnoreCase(entity)) { + sb.append('<'); + } else if ("gt".equalsIgnoreCase(entity)) { + sb.append('>'); + } else { + sb.append(AMP).append(entity).append(';'); + } + } + // skip past the entity we just parsed. + i += entity.length() + 1; + } else { + // this shouldn't happen in most cases since the parser + // errors on unclosed entities.
+ sb.append(c); + } + } else { + // not part of an entity sb.append(c); } } @@ -227,7 +288,6 @@ public class XML { if (token == null) { token = x.nextToken(); } - // attribute = value if (token instanceof String) { string = (String) token; @@ -238,7 +298,7 @@ public class XML { throw x.syntaxError("Missing value"); } jsonobject.accumulate(string, - keepStrings ? token : JSONObject.stringToValue((String) token)); + keepStrings ? unescape((String)token) : stringToValue((String) token)); token = null; } else { jsonobject.accumulate(string, ""); @@ -270,7 +330,7 @@ public class XML { string = (String) token; if (string.length() > 0) { jsonobject.accumulate("content", - keepStrings ? token : JSONObject.stringToValue(string)); + keepStrings ? unescape(string) : stringToValue(string)); } } else if (token == LT) { @@ -297,16 +357,18 @@ public class XML { } /** - * This method has been deprecated in favor of the - * {@link JSONObject.stringToValue(String)} method. Use it instead. + * This method is the same as {@link JSONObject#stringToValue(String)} + * except that this also tries to unescape String values. * - * @deprecated Use JSONObject#stringToValue(String) instead. * @param string String to convert * @return JSON value of this string or the string */ - @Deprecated public static Object stringToValue(String string) { - return JSONObject.stringToValue(string); + Object ret = JSONObject.stringToValue(string); + if(ret instanceof String){ + return unescape((String)ret); + } + return ret; } /** From 34652a87061f2e71321d1175724c162a62b00239 Mon Sep 17 00:00:00 2001 From: "John J. Aylward" Date: Thu, 22 Sep 2016 14:13:14 -0400 Subject: [PATCH 2/9] Updates to iterate on code points instead of characters and changes the encoding to only encode control characters as defined by ISO standard.
--- XML.java | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/XML.java b/XML.java index 833488a..025c3be 100644 --- a/XML.java +++ b/XML.java @@ -61,6 +61,42 @@ public class XML { /** The Character '/'. */ public static final Character SLASH = '/'; + + /** + * Creates an iterator for navigating Code Points in a string instead of + * characters. + * + * @see http://stackoverflow.com/a/21791059/6030888 + */ + private static Iterable<Integer> codePointIterator(final String string) { + return new Iterable<Integer>() { + @Override + public Iterator<Integer> iterator() { + return new Iterator<Integer>() { + private int nextIndex = 0; + private int length = string.length(); + + @Override + public boolean hasNext() { + return this.nextIndex < this.length; + } + + @Override + public Integer next() { + int result = string.codePointAt(this.nextIndex); + this.nextIndex += Character.charCount(result); + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } /** * Replace special characters with XML escapes: @@ -79,8 +115,7 @@ public class XML { */ public static String escape(String string) { StringBuilder sb = new StringBuilder(string.length()); - for (int i = 0, length = string.length(); i < length; i++) { - char c = string.charAt(i); + for (final int c : codePointIterator(string)) { switch (c) { case '&': sb.append("&amp;"); @@ -98,18 +133,18 @@ public class XML { sb.append("&apos;"); break; default: - if (c < ' ' || (c >= '\u0080' && c < '\u00a0') || (c >= '\u2000' && c < '\u2100')) { + if (Character.isISOControl(c)) { sb.append("&#x"); sb.append(Integer.toHexString(c)); sb.append(";"); } else { - sb.append(c); + sb.append(new String(Character.toChars(c))); } } } return sb.toString(); } - + /** * Removes XML escapes from the string. * From 68f92eb39568b0d1f736dfa878a36e32137a8a69 Mon Sep 17 00:00:00 2001 From: "John J.
Aylward" Date: Thu, 22 Sep 2016 14:40:39 -0400 Subject: [PATCH 3/9] Adds more javadoc. --- XML.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/XML.java b/XML.java index 025c3be..8e63c74 100644 --- a/XML.java +++ b/XML.java @@ -64,7 +64,11 @@ public class XML { /** * Creates an iterator for navigating Code Points in a string instead of - * characters. + * characters. Once Java7 support is dropped, this can be replaced with + * + * string.codePoints() + * + * which is available in Java8 and above. * * @see http://stackoverflow.com/a/21791059/6030888 From c11e09959c546740963c0b8627f815b9f29c941e Mon Sep 17 00:00:00 2001 From: "John J. Aylward" Date: Thu, 22 Sep 2016 15:40:26 -0400 Subject: [PATCH 4/9] Fixes code point output when unescaping code points. XML escapes are an entire code point, not surrogate pairs like in JSON. --- XML.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/XML.java b/XML.java index 8e63c74..890b2de 100644 --- a/XML.java +++ b/XML.java @@ -165,15 +165,15 @@ public class XML { if (semic > i) { final String entity = string.substring(i + 1, semic); if (entity.charAt(0) == '#') { - char cc; + int cp; if (entity.charAt(1) == 'x') { // hex encoded unicode - cc = (char) Integer.parseInt(entity.substring(2), 16); + cp = Integer.parseInt(entity.substring(2), 16); } else { // decimal encoded unicode - cc = (char) Integer.parseInt(entity.substring(1)); + cp = Integer.parseInt(entity.substring(1)); } - sb.append(cc); + sb.append(new String(Character.toChars(cp))); } else { if ("quot".equalsIgnoreCase(entity)) { sb.append('"'); From f58a0f468475b0277a123dbe44fedbefbdb993c9 Mon Sep 17 00:00:00 2001 From: "John J. 
Aylward" Date: Thu, 22 Sep 2016 16:10:49 -0400 Subject: [PATCH 5/9] fixes code point appends to string builder --- XML.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/XML.java b/XML.java index 890b2de..a740b0e 100644 --- a/XML.java +++ b/XML.java @@ -119,8 +119,8 @@ public class XML { */ public static String escape(String string) { StringBuilder sb = new StringBuilder(string.length()); - for (final int c : codePointIterator(string)) { - switch (c) { + for (final int cp : codePointIterator(string)) { + switch (cp) { case '&': sb.append("&amp;"); break; @@ -137,12 +137,12 @@ public class XML { sb.append("&apos;"); break; default: - if (Character.isISOControl(c)) { + if (Character.isISOControl(cp)) { sb.append("&#x"); - sb.append(Integer.toHexString(c)); + sb.append(Integer.toHexString(cp)); sb.append(";"); } else { - sb.append(new String(Character.toChars(c))); + sb.appendCodePoint(cp); } } } @@ -173,7 +173,7 @@ public class XML { // decimal encoded unicode cp = Integer.parseInt(entity.substring(1)); } - sb.append(new String(Character.toChars(cp))); + sb.appendCodePoint(cp); } else { if ("quot".equalsIgnoreCase(entity)) { sb.append('"'); From adb0478f66042bb35fcaad020fc76f7e1a5b6acd Mon Sep 17 00:00:00 2001 From: "John J. Aylward" Date: Thu, 22 Sep 2016 16:23:09 -0400 Subject: [PATCH 6/9] properly unescape tokens in JSONML for reversibility. --- JSONML.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/JSONML.java b/JSONML.java index 9f861eb..82853a9 100644 --- a/JSONML.java +++ b/JSONML.java @@ -175,7 +175,7 @@ public class JSONML { if (!(token instanceof String)) { throw x.syntaxError("Missing value"); } - newjo.accumulate(attribute, keepStrings ? token :XML.stringToValue((String)token)); + newjo.accumulate(attribute, keepStrings ?
XML.unescape((String)token) :XML.stringToValue((String)token)); token = null; } else { newjo.accumulate(attribute, ""); @@ -226,7 +226,7 @@ public class JSONML { } else { if (ja != null) { ja.put(token instanceof String - ? keepStrings ? token :XML.stringToValue((String)token) + ? keepStrings ? XML.unescape((String)token) :XML.stringToValue((String)token) : token); } } From fb1db9341ecd4173503f65e0d6309c1e1aca0a5e Mon Sep 17 00:00:00 2001 From: "John J. Aylward" Date: Wed, 28 Sep 2016 20:15:58 -0400 Subject: [PATCH 7/9] Changes encoding to better match the XML spec section 2.2 --- XML.java | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/XML.java b/XML.java index a740b0e..4ceec5e 100644 --- a/XML.java +++ b/XML.java @@ -137,7 +137,7 @@ public class XML { sb.append("'"); break; default: - if (Character.isISOControl(cp)) { + if (mustEscape(cp)) { sb.append("&#x"); sb.append(Integer.toHexString(cp)); sb.append(";"); @@ -149,6 +149,32 @@ public class XML { return sb.toString(); } + /** + * @param cp code point to test + * @return true if the code point is not valid for an XML + */ + private static boolean mustEscape(int cp) { + /* Valid range from https://www.w3.org/TR/REC-xml/#charsets + * + * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + * + * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. + */ + // isISOControl is true when (cp >= 0 && cp <= 0x1F) || (cp >= 0x7F && cp <= 0x9F) + // all ISO control characters are out of range except tabs and new lines + return (Character.isISOControl(cp) + && cp != 0x9 + && cp != 0xA + && cp != 0xD + ) || !( + // valid the range of acceptable characters that aren't control + (cp >= 0x20 && cp <= 0xD7FF) + || (cp >= 0xE000 && cp <= 0xFFFD) + || (cp >= 0x10000 && cp <= 0x10FFFF) + ) + ; + } + /** * Removes XML escapes from the string. * From e477d7002b71afdf997a2d4e19a0814ca75cb0c0 Mon Sep 17 00:00:00 2001 From: "John J. 
Aylward" Date: Wed, 28 Sep 2016 20:22:12 -0400 Subject: [PATCH 8/9] fixes object comparison --- XML.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/XML.java b/XML.java index 4ceec5e..047c701 100644 --- a/XML.java +++ b/XML.java @@ -186,7 +186,7 @@ public class XML { StringBuilder sb = new StringBuilder(string.length()); for (int i = 0, length = string.length(); i < length; i++) { char c = string.charAt(i); - if (c == AMP) { + if (c == '&') { final int semic = string.indexOf(';', i); if (semic > i) { final String entity = string.substring(i + 1, semic); @@ -204,7 +204,7 @@ public class XML { if ("quot".equalsIgnoreCase(entity)) { sb.append('"'); } else if ("amp".equalsIgnoreCase(entity)) { - sb.append(AMP); + sb.append('&'); } else if ("apos".equalsIgnoreCase(entity)) { sb.append('\''); } else if ("lt".equalsIgnoreCase(entity)) { @@ -212,7 +212,7 @@ public class XML { } else if ("gt".equalsIgnoreCase(entity)) { sb.append('>'); } else { - sb.append(AMP).append(entity).append(';'); + sb.append('&').append(entity).append(';'); } } // skip past the entity we just parsed. From 93ffca36c357c489cc283ecc2a103281b8b73037 Mon Sep 17 00:00:00 2001 From: "John J. Aylward" Date: Wed, 28 Sep 2016 20:23:30 -0400 Subject: [PATCH 9/9] fixes spacing --- XML.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/XML.java b/XML.java index 047c701..78dd6a0 100644 --- a/XML.java +++ b/XML.java @@ -166,12 +166,12 @@ public class XML { && cp != 0x9 && cp != 0xA && cp != 0xD - ) || !( - // valid the range of acceptable characters that aren't control - (cp >= 0x20 && cp <= 0xD7FF) - || (cp >= 0xE000 && cp <= 0xFFFD) - || (cp >= 0x10000 && cp <= 0x10FFFF) - ) + ) || !( + // valid the range of acceptable characters that aren't control + (cp >= 0x20 && cp <= 0xD7FF) + || (cp >= 0xE000 && cp <= 0xFFFD) + || (cp >= 0x10000 && cp <= 0x10FFFF) + ) ; }