diff --git a/JSONML.java b/JSONML.java
index 7e99aab..82853a9 100644
--- a/JSONML.java
+++ b/JSONML.java
@@ -175,7 +175,7 @@ public class JSONML {
if (!(token instanceof String)) {
throw x.syntaxError("Missing value");
}
- newjo.accumulate(attribute, keepStrings ? token :JSONObject.stringToValue((String)token));
+ newjo.accumulate(attribute, keepStrings ? XML.unescape((String)token) :XML.stringToValue((String)token));
token = null;
} else {
newjo.accumulate(attribute, "");
@@ -226,7 +226,7 @@ public class JSONML {
} else {
if (ja != null) {
ja.put(token instanceof String
- ? keepStrings ? token :JSONObject.stringToValue((String)token)
+ ? keepStrings ? XML.unescape((String)token) :XML.stringToValue((String)token)
: token);
}
}
diff --git a/XML.java b/XML.java
index 0f0bee9..78dd6a0 100644
--- a/XML.java
+++ b/XML.java
@@ -35,7 +35,6 @@ import java.util.Iterator;
*/
@SuppressWarnings("boxing")
public class XML {
-
/** The Character '&'. */
public static final Character AMP = '&';
@@ -62,6 +61,46 @@ public class XML {
/** The Character '/'. */
public static final Character SLASH = '/';
+
+ /**
+  * Creates an iterator for navigating Code Points in a string instead of
+  * characters. Once Java7 support is dropped, this can be replaced with
+  * {@code string.codePoints()}, which is available in Java8 and above.
+  *
+  * @param string the text to iterate over
+  * @return an Iterable whose iterator yields each code point of the string
+  * @see <a href="http://stackoverflow.com/a/21791059/6030888">
+  *      http://stackoverflow.com/a/21791059/6030888</a>
+  */
+ private static Iterable<Integer> codePointIterator(final String string) {
+     return new Iterable<Integer>() {
+         @Override
+         public Iterator<Integer> iterator() {
+             return new Iterator<Integer>() {
+                 private int nextIndex = 0;
+                 private final int length = string.length();
+                 @Override
+                 public boolean hasNext() {
+                     return this.nextIndex < this.length;
+                 }
+                 @Override
+                 public Integer next() {
+                     if (!hasNext()) {
+                         throw new java.util.NoSuchElementException();
+                     }
+                     int result = string.codePointAt(this.nextIndex);
+                     this.nextIndex += Character.charCount(result);
+                     return result;
+                 }
+                 @Override
+                 public void remove() {
+                     throw new UnsupportedOperationException();
+                 }
+             };
+         }
+     };
+ }
/**
* Replace special characters with XML escapes:
@@ -71,6 +110,7 @@ public class XML {
* &lt; (less than) is replaced by &amp;lt;
* &gt; (greater than) is replaced by &amp;gt;
* &quot; (double quote) is replaced by &amp;quot;
+ * &apos; (single quote / apostrophe) is replaced by &amp;apos;
*
*
* @param string
@@ -79,9 +119,8 @@ public class XML {
*/
public static String escape(String string) {
StringBuilder sb = new StringBuilder(string.length());
- for (int i = 0, length = string.length(); i < length; i++) {
- char c = string.charAt(i);
- switch (c) {
+ for (final int cp : codePointIterator(string)) {
+ switch (cp) {
case '&':
sb.append("&");
break;
@@ -98,6 +137,93 @@ public class XML {
sb.append("'");
break;
default:
+ if (mustEscape(cp)) {
+ sb.append("");
+ sb.append(Integer.toHexString(cp));
+ sb.append(";");
+ } else {
+ sb.appendCodePoint(cp);
+ }
+ }
+ }
+ return sb.toString();
+ }
+
+ /**
+  * Tells whether a code point has to be emitted as a numeric character reference.
+  *
+  * @param cp code point to test
+  * @return true if the code point must be written as a character reference
+  */
+ private static boolean mustEscape(int cp) {
+     /* Valid range from https://www.w3.org/TR/REC-xml/#charsets
+      *
+      * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+      *
+      * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
+      */
+     // Tab, LF and CR are the only valid characters below #x20; accept them
+     // first, otherwise the range test below rejects them and they get escaped.
+     if (cp == 0x9 || cp == 0xA || cp == 0xD) {
+         return false;
+     }
+     // isISOControl is true when (cp >= 0 && cp <= 0x1F) || (cp >= 0x7F && cp <= 0x9F);
+     // the upper band sits inside [#x20-#xD7FF] but is escaped all the same.
+     return Character.isISOControl(cp)
+         || !((cp >= 0x20 && cp <= 0xD7FF)
+             || (cp >= 0xE000 && cp <= 0xFFFD)
+             || (cp >= 0x10000 && cp <= 0x10FFFF));
+ }
+
+ /**
+ * Removes XML escapes from the string.
+ *
+ * @param string
+ * string to remove escapes from
+ * @return string with converted entities
+ */
+ public static String unescape(String string) {
+ StringBuilder sb = new StringBuilder(string.length());
+ for (int i = 0, length = string.length(); i < length; i++) {
+ char c = string.charAt(i);
+ if (c == '&') {
+ final int semic = string.indexOf(';', i);
+ if (semic > i) {
+ final String entity = string.substring(i + 1, semic);
+ if (entity.charAt(0) == '#') {
+ int cp;
+ if (entity.charAt(1) == 'x') {
+ // hex encoded unicode
+ cp = Integer.parseInt(entity.substring(2), 16);
+ } else {
+ // decimal encoded unicode
+ cp = Integer.parseInt(entity.substring(1));
+ }
+ sb.appendCodePoint(cp);
+ } else {
+ if ("quot".equalsIgnoreCase(entity)) {
+ sb.append('"');
+ } else if ("amp".equalsIgnoreCase(entity)) {
+ sb.append('&');
+ } else if ("apos".equalsIgnoreCase(entity)) {
+ sb.append('\'');
+ } else if ("lt".equalsIgnoreCase(entity)) {
+ sb.append('<');
+ } else if ("gt".equalsIgnoreCase(entity)) {
+ sb.append('>');
+ } else {
+ sb.append('&').append(entity).append(';');
+ }
+ }
+ // skip past the entity we just parsed.
+ i += entity.length() + 1;
+ } else {
+ // this shouldn't happen in most cases since the parser
+ // errors on unclosed entities.
+ sb.append(c);
+ }
+ } else {
+ // not part of an entity
sb.append(c);
}
}
@@ -227,7 +353,6 @@ public class XML {
if (token == null) {
token = x.nextToken();
}
-
// attribute = value
if (token instanceof String) {
string = (String) token;
@@ -238,7 +363,7 @@ public class XML {
throw x.syntaxError("Missing value");
}
jsonobject.accumulate(string,
- keepStrings ? token : JSONObject.stringToValue((String) token));
+ keepStrings ? unescape((String)token) : stringToValue((String) token));
token = null;
} else {
jsonobject.accumulate(string, "");
@@ -270,7 +395,7 @@ public class XML {
string = (String) token;
if (string.length() > 0) {
jsonobject.accumulate("content",
- keepStrings ? token : JSONObject.stringToValue(string));
+ keepStrings ? unescape(string) : stringToValue(string));
}
} else if (token == LT) {
@@ -297,16 +422,18 @@ public class XML {
}
/**
- * This method has been deprecated in favor of the
- * {@link JSONObject.stringToValue(String)} method. Use it instead.
+ * Converts a string like {@link JSONObject#stringToValue(String)} does,
+ * except that a result that is still a String is also XML-unescaped.
*
- * @deprecated Use JSONObject#stringToValue(String) instead.
* @param string String to convert
* @return JSON value of this string or the string
*/
- @Deprecated
public static Object stringToValue(String string) {
- return JSONObject.stringToValue(string);
+ final Object value = JSONObject.stringToValue(string);
+ if (!(value instanceof String)) {
+     return value;
+ }
+ return unescape((String) value);
}
/**