Change how hashtags are normalized (#18795)

* Change how hashtags are normalized

* Fix tests
This commit is contained in:
Eugen Rochko
2022-07-13 15:03:28 +02:00
committed by GitHub
parent 12ed2d793b
commit e7aa2be828
29 changed files with 193 additions and 51 deletions

10
app/lib/ascii_folding.rb Normal file
View File

@ -0,0 +1,10 @@
# frozen_string_literal: true
# Replaces accented Latin letters (Latin-1 Supplement and Latin Extended-A)
# with a plain ASCII stand-in.
#
# The two constants are parallel tables: the character at index i of
# NON_ASCII_CHARS is replaced by the character at index i of
# EQUIVALENT_ASCII_CHARS, so both strings must always have the same length
# and NON_ASCII_CHARS must not contain duplicates.
class ASCIIFolding
  # Characters to fold, in code point order within each base letter.
  #
  # A few entries are spelled as \u escapes because they are easily damaged
  # when the file is re-encoded or Unicode-normalized:
  # * U+0103, U+0104, U+0105 (a with breve, A/a with ogonek) had been turned
  #   into the mojibake U+00E3 U+00A5 U+00B9, which left those three letters
  #   unfolded and made the yen sign and superscript one fold to "A"/"a".
  # * U+0149 (n preceded by apostrophe) has a look-alike compatibility
  #   decomposition (U+02BC + "n") that is two characters long and would
  #   shift every mapping after it.
  NON_ASCII_CHARS =
    'ÀÁÂÃÄÅàáâãäåĀāĂ' \
    "\u0103\u0104\u0105" \
    'ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇň' \
    "\u0149" \
    'ŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'

  # ASCII replacement for the character at the same index in NON_ASCII_CHARS.
  EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'

  # Folds every character listed in NON_ASCII_CHARS to its ASCII counterpart.
  #
  # @param str [String] text to fold
  # @return [String] a new string; characters not listed in NON_ASCII_CHARS
  #   are returned unchanged
  def fold(str)
    str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
  end
end

View File

@ -0,0 +1,25 @@
# frozen_string_literal: true
# Reduces a hashtag string to a canonical form by running it through a fixed
# sequence of text transformations.
class HashtagNormalizer
  # Applies, in order: NFKC normalization, lowercasing, ASCII folding and
  # finally removal of every character that is neither alphanumeric nor one
  # of Tag::HASHTAG_SEPARATORS.
  #
  # @param str [String] raw hashtag text
  # @return [String] the normalized hashtag
  def normalize(str)
    width_normalized = cjk_width(str)
    lowercased = lowercase(width_normalized)
    folded = ascii_folding(lowercased)

    remove_invalid_characters(folded)
  end

  private

  # Unicode NFKC (compatibility composition) normalization. Judging by the
  # method name, the intent is to unify full-width and half-width forms.
  def cjk_width(str)
    str.unicode_normalize(:nfkc)
  end

  # Lowercases through the string's multibyte proxy and converts the result
  # back to a plain String.
  # NOTE(review): mb_chars is not defined in this file - presumably the
  # ActiveSupport core extension; confirm.
  def lowercase(str)
    str.mb_chars.downcase.to_s
  end

  # Replaces accented Latin letters with their ASCII equivalents.
  def ascii_folding(str)
    folder = ASCIIFolding.new
    folder.fold(str)
  end

  # Drops every character that is not alphanumeric and not listed in
  # Tag::HASHTAG_SEPARATORS (interpolated into the character class).
  def remove_invalid_characters(str)
    disallowed = /[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/
    str.gsub(disallowed, '')
  end
end