Improved normalizing with support for common separators (spaces, punctuation and brackets are now collapsed into a single hyphen)

This commit is contained in:
Erik C. Thauvin 2024-07-31 17:10:00 -07:00
parent c04fa16307
commit 957bb694c6
Signed by: erik
GPG key ID: 776702A6A2DA330E
2 changed files with 21 additions and 16 deletions

View file

@ -334,21 +334,20 @@ public final class RenderUtils {
return src; return src;
} }
var normalized = Normalizer.normalize(src.trim(), Normalizer.Form.NFD); var normalized = Normalizer.normalize(src.trim(), Normalizer.Form.NFD).toCharArray();
var sb = new StringBuilder(normalized.length());
boolean space = false; var sb = new StringBuilder(normalized.length);
for (var c : normalized.toCharArray()) { for (var i = 0; i < normalized.length; i++) {
if (c <= '\u007F') { // ascii only var c = normalized[i];
if (!space && c == ' ') { if (c <= '\u007F') { // ASCII only
space = true; if (" &()-_=[{]}\\|;:,<.>/".indexOf(c) != -1) { // common separators
if (!sb.isEmpty() && i != normalized.length - 1 && sb.charAt(sb.length() - 1) != '-') {
sb.append('-'); sb.append('-');
} else {
space = false;
if (c >= '0' && c <= '9' || c >= 'a' && c <= 'z') {
sb.append(c);
} else if (c >= 'A' && c <= 'Z') {
sb.append((char) (c + 32)); // lowercase
} }
} else if (c >= '0' && c <= '9' || c >= 'a' && c <= 'z') { // letters & digits
sb.append(c);
} else if (c >= 'A' && c <= 'Z') { // uppercase letters
sb.append((char) (c + 32)); // make lowercase
} }
} }
} }

View file

@ -96,8 +96,14 @@ class TestRenderUtils {
@Test @Test
void testNormalize() { void testNormalize() {
assertThat(RenderUtils.normalize("")).isEmpty(); assertThat(RenderUtils.normalize("")).as("empty").isEmpty();
assertThat(RenderUtils.normalize(SAMPLE_GERMAN)).isEqualTo("mochten-sie-ein-paar-apfel"); assertThat(RenderUtils.normalize(" &()-_=[{]}\\|;:,<.>/")).as("blank").isEmpty();
assertThat(RenderUtils.normalize(SAMPLE_GERMAN)).as("greman").isEqualTo("mochten-sie-ein-paar-apfel");
assertThat(RenderUtils.normalize("foo bar, <foo-bar>,foo:bar,foo;(bar), {foo} & bar=foo.bar[foo|bar]"))
.as("foo-bar")
.isEqualTo("foo-bar-foo-bar-foo-bar-foo-bar-foo-bar-foo-bar-foo-bar");
assertThat(RenderUtils.normalize("News for January 6, 2023 (Paris)")).as("docs example")
.isEqualTo("news-for-january-6-2023-paris");
} }
@Test @Test