This is a sample Java client for the ANTLR 4 Unicode grapheme cluster parser grammar:
https://github.com/antlr/grammars-v4/tree/master/unicode/graphemes
% mvn install
import com.github.bhamiltoncx.UnicodeGraphemeParsing;
/**
 * Command-line demo that splits each argument into the pieces reported by
 * {@code UnicodeGraphemeParsing.parse} and prints one line per piece.
 */
public class Example {
    /**
     * For every command-line argument, prints a header line followed by one
     * line per parse result: whether the result is tagged as emoji, the
     * matching slice of the argument, and the result's offset and length.
     *
     * @param strings the strings to parse, one per command-line argument
     */
    public static void main(String[] strings) {
        for (int argIndex = 0; argIndex < strings.length; argIndex++) {
            final String input = strings[argIndex];
            System.out.printf("Parsing string: %s\n", input);

            for (UnicodeGraphemeParsing.Result cluster : UnicodeGraphemeParsing.parse(input)) {
                // Offset and length are used directly as String.substring indices.
                final int start = cluster.stringOffset;
                final int end = start + cluster.stringLength;
                final String text = input.substring(start, end);

                final String label;
                if (cluster.type == UnicodeGraphemeParsing.Result.Type.EMOJI) {
                    label = "Emoji";
                } else {
                    label = "Non-Emoji";
                }

                System.out.printf("%s: [%s] (offset=%d, length=%d)\n", label, text, start, cluster.stringLength);
            }
        }
    }
}
% mvn install
% javac -cp target/unicode-graphemes-0.1-SNAPSHOT.jar example/Example.java
% alias parse-graphemes="java -cp $HOME/.m2/repository/org/antlr/antlr4-runtime/4.7/antlr4-runtime-4.7.jar:$HOME/.m2/repository/com/github/bhamiltoncx/unicode-graphemes/0.1-SNAPSHOT/unicode-graphemes-0.1-SNAPSHOT.jar:example Example"
% parse-graphemes abc😀💩👮🏿‍♀️👩‍👩‍👧‍👧🇫🇮
Parsing string: abc😀💩👮🏿‍♀️👩‍👩‍👧‍👧🇫🇮
Non-Emoji: [a] (offset=0, length=1)
Non-Emoji: [b] (offset=1, length=1)
Non-Emoji: [c] (offset=2, length=1)
Emoji: [😀] (offset=3, length=2)
Emoji: [💩] (offset=5, length=2)
Emoji: [👮🏿‍♀️] (offset=7, length=7)
Emoji: [👩‍👩‍👧‍👧] (offset=14, length=11)
Emoji: [🇫🇮] (offset=25, length=4)