Skip to content

Commit 4c1ff57

Browse files
authored
Update emoji regex (#11584)
When matching emoji, use a regex built from the emoji data we already have instead of a generic one based on Unicode ranges. A generic regex cannot tell the difference between two separate emoji placed next to each other and a single emoji that is composed of several emoji code points joined together. With this change, emoji that sit next to each other without a space in between are each wrapped in their own span, with the proper title and other attributes.
1 parent 02fa329 commit 4c1ff57

File tree

3 files changed

+31
-7
lines changed

3 files changed

+31
-7
lines changed

modules/emoji/emoji.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
package emoji
77

88
import (
9+
"sort"
910
"strings"
1011
"sync"
12+
"unicode/utf8"
1113
)
1214

1315
// Gemoji is a set of emoji data.
@@ -48,6 +50,12 @@ func loadMap() {
4850
// process emoji codes and aliases
4951
codePairs := make([]string, 0)
5052
aliasPairs := make([]string, 0)
53+
54+
// sort from largest to small so we match combined emoji first
55+
sort.Slice(GemojiData, func(i, j int) bool {
56+
return len(GemojiData[i].Emoji) > len(GemojiData[j].Emoji)
57+
})
58+
5159
for i, e := range GemojiData {
5260
if e.Emoji == "" || len(e.Aliases) == 0 {
5361
continue
@@ -72,6 +80,7 @@ func loadMap() {
7280
codeReplacer = strings.NewReplacer(codePairs...)
7381
aliasReplacer = strings.NewReplacer(aliasPairs...)
7482
})
83+
7584
}
7685

7786
// FromCode retrieves the emoji data based on the provided unicode code (ie,
@@ -117,3 +126,21 @@ func ReplaceAliases(s string) string {
117126
loadMap()
118127
return aliasReplacer.Replace(s)
119128
}
129+
130+
// FindEmojiSubmatchIndex returns index pair of longest emoji in a string
131+
func FindEmojiSubmatchIndex(s string) []int {
132+
loadMap()
133+
134+
// if rune and string length are the same then no emoji will be present
135+
// similar performance when there is unicode present but almost 200% faster when not
136+
if utf8.RuneCountInString(s) == len(s) {
137+
return nil
138+
}
139+
for j := range GemojiData {
140+
i := strings.Index(s, GemojiData[j].Emoji)
141+
if i != -1 {
142+
return []int{i, i + len(GemojiData[j].Emoji)}
143+
}
144+
}
145+
return nil
146+
}

modules/markup/html.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,6 @@ var (
6565

6666
// EmojiShortCodeRegex find emoji by alias like :smile:
6767
EmojiShortCodeRegex = regexp.MustCompile(`\:[\w\+\-]+\:{1}`)
68-
69-
// find emoji literal: search all emoji hex range as many times as they appear as
70-
// some emojis (skin color etc..) are just two or more chained together
71-
emojiRegex = regexp.MustCompile(`[\x{1F000}-\x{1FFFF}|\x{2000}-\x{32ff}|\x{fe4e5}-\x{fe4ee}|\x{200D}|\x{FE0F}|\x{e0000}-\x{e007f}]+`)
7268
)
7369

7470
// CSS class for action keywords (e.g. "closes: #1")
@@ -922,8 +918,7 @@ func emojiShortCodeProcessor(ctx *postProcessCtx, node *html.Node) {
922918

923919
// emoji processor to match emoji and add emoji class
924920
func emojiProcessor(ctx *postProcessCtx, node *html.Node) {
925-
m := emojiRegex.FindStringSubmatchIndex(node.Data)
926-
921+
m := emoji.FindEmojiSubmatchIndex(node.Data)
927922
if m == nil {
928923
return
929924
}

modules/markup/html_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,9 @@ func TestRender_emoji(t *testing.T) {
263263
test(
264264
"Some text with :smile: in the middle",
265265
`<p>Some text with <span class="emoji" aria-label="grinning face with smiling eyes">😄</span> in the middle</p>`)
266-
266+
test(
267+
"Some text with 😄😄 2 emoji next to each other",
268+
`<p>Some text with <span class="emoji" aria-label="grinning face with smiling eyes">😄</span><span class="emoji" aria-label="grinning face with smiling eyes">😄</span> 2 emoji next to each other</p>`)
267269
// should match nothing
268270
test(
269271
"2001:0db8:85a3:0000:0000:8a2e:0370:7334",

0 commit comments

Comments
 (0)