diff --git a/internal/text/goldmark_parsers.go b/internal/text/goldmark_parsers.go index b7cf4f9e9..e2c87e057 100644 --- a/internal/text/goldmark_parsers.go +++ b/internal/text/goldmark_parsers.go @@ -177,7 +177,7 @@ func (p *hashtagParser) Parse( // Ignore initial '#'. continue - case !isPlausiblyInHashtag(r) && + case !isPermittedInHashtag(r) && !isHashtagBoundary(r): // Weird non-boundary character // in the hashtag. Don't trust it. diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go index 98ed3a96b..153673415 100644 --- a/internal/text/markdown_test.go +++ b/internal/text/markdown_test.go @@ -50,6 +50,8 @@ withInlineCode2Expected = "
Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?
here's a simple status that uses hashtag #Hashtag!
" + withTamilHashtag = "here's a simple status that uses a hashtag in Tamil #தமிழ்" + withTamilHashtagExpected = "here's a simple status that uses a hashtag in Tamil #தமிழ்
" mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a link.\n\nHere's an image:Here's a simple text in markdown.
Here's a link.
Here's an image:
" mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: " @@ -121,6 +123,12 @@ func (suite *MarkdownTestSuite) TestParseWithHashtag() { suite.Equal(withHashtagExpected, formatted.HTML) } +// Regression test for https://github.com/superseriousbusiness/gotosocial/issues/3618 +func (suite *MarkdownTestSuite) TestParseWithTamilHashtag() { + formatted := suite.FromMarkdown(withTamilHashtag) + suite.Equal(withTamilHashtagExpected, formatted.HTML) +} + func (suite *MarkdownTestSuite) TestParseWithHTML() { formatted := suite.FromMarkdown(mdWithHTML) suite.Equal(mdWithHTMLExpected, formatted.HTML) diff --git a/internal/text/normalize.go b/internal/text/normalize.go index d2e633d1e..ea266fb33 100644 --- a/internal/text/normalize.go +++ b/internal/text/normalize.go @@ -50,17 +50,16 @@ func NormalizeHashtag(text string) (string, bool) { // Validate normalized result. var ( - notJustUnderscores = false - onlyPermittedChars = true - lengthOK = true + atLeastOneRequiredChar = false + onlyPermittedChars = true + lengthOK = true ) for i, r := range normalized { - if r != '_' { - // This isn't an underscore, - // so the whole hashtag isn't - // just underscores. 
- notJustUnderscores = true + if !isPermittedIfNotEntireHashtag(r) { + // This isn't an underscore, mark, etc, + // so the hashtag contains at least one required character. + atLeastOneRequiredChar = true } if i >= maximumHashtagLength { @@ -74,5 +73,5 @@ func NormalizeHashtag(text string) (string, bool) { } } - return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores) + return normalized, lengthOK && onlyPermittedChars && atLeastOneRequiredChar } diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go index fac54a38e..ffa64ce44 100644 --- a/internal/text/plain_test.go +++ b/internal/text/plain_test.go @@ -118,20 +118,20 @@ func (suite *PlainTestSuite) TestDeriveHashtagsOK() { ` tags := suite.FromPlain(statusText).Tags - suite.Len(tags, 13) - suite.Equal("testing123", tags[0].Name) - suite.Equal("also", tags[1].Name) - suite.Equal("thisshouldwork", tags[2].Name) - suite.Equal("dupe", tags[3].Name) - suite.Equal("ThisShouldAlsoWork", tags[4].Name) - suite.Equal("this_should_not_be_split", tags[5].Name) - suite.Equal("111111", tags[6].Name) - suite.Equal("alimentación", tags[7].Name) - suite.Equal("saúde", tags[8].Name) - suite.Equal("lävistää", tags[9].Name) - suite.Equal("ö", tags[10].Name) - suite.Equal("네", tags[11].Name) - suite.Equal("ThisOneIsThirteyCharactersLong", tags[12].Name) + if suite.Len(tags, 12) { + suite.Equal("testing123", tags[0].Name) + suite.Equal("also", tags[1].Name) + suite.Equal("thisshouldwork", tags[2].Name) + suite.Equal("dupe", tags[3].Name) + suite.Equal("ThisShouldAlsoWork", tags[4].Name) + suite.Equal("this_should_not_be_split", tags[5].Name) + suite.Equal("alimentación", tags[6].Name) + suite.Equal("saúde", tags[7].Name) + suite.Equal("lävistää", tags[8].Name) + suite.Equal("ö", tags[9].Name) + suite.Equal("네", tags[10].Name) + suite.Equal("ThisOneIsThirteyCharactersLong", tags[11].Name) + } statusText = `#올빼미 hej` tags = suite.FromPlain(statusText).Tags @@ -170,8 +170,17 @@ func (suite *PlainTestSuite) 
TestDeriveMultiple() { func (suite *PlainTestSuite) TestZalgoHashtag() { statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?` f := suite.FromPlain(statusText) - suite.Len(f.Tags, 1) - suite.Equal("praying", f.Tags[0].Name) + if suite.Len(f.Tags, 2) { + suite.Equal("praying", f.Tags[0].Name) + // NFC doesn't do much for Zalgo text, but it's difficult to strip marks without affecting non-Latin text. + suite.Equal("z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪", f.Tags[1].Name) + } +} + +func (suite *PlainTestSuite) TestNumbersAreNotHashtags() { + statusText := `yo who else thinks #19_98 is #1?` + f := suite.FromPlain(statusText) + suite.Len(f.Tags, 0) } func TestPlainTestSuite(t *testing.T) { diff --git a/internal/text/util.go b/internal/text/util.go index af45cfaf0..47b2416dd 100644 --- a/internal/text/util.go +++ b/internal/text/util.go @@ -19,19 +19,14 @@ import "unicode" -func isPlausiblyInHashtag(r rune) bool { - // Marks are allowed during parsing - // prior to normalization, but not after, - // since they may be combined into letters - // during normalization. - return unicode.IsMark(r) || - isPermittedInHashtag(r) +func isPermittedInHashtag(r rune) bool { + return unicode.IsLetter(r) || isPermittedIfNotEntireHashtag(r) } -func isPermittedInHashtag(r rune) bool { - return unicode.IsLetter(r) || - unicode.IsNumber(r) || - r == '_' +// isPermittedIfNotEntireHashtag is true for characters that may be in a hashtag +// but are not allowed to be the only characters making up the hashtag. +func isPermittedIfNotEntireHashtag(r rune) bool { + return unicode.IsNumber(r) || unicode.IsMark(r) || r == '_' } // isHashtagBoundary returns true if rune r