Change how hashtags are normalized (#18795)
* Change how hashtags are normalized * Fix tests
This commit is contained in:
10
app/lib/ascii_folding.rb
Normal file
10
app/lib/ascii_folding.rb
Normal file
@@ -0,0 +1,10 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Folds accented Latin letters (Latin-1 Supplement and Latin Extended-A) down
# to their closest unaccented ASCII letter.
class ASCIIFolding
  # Characters that get folded. The character at index i is replaced by the
  # character at index i of EQUIVALENT_ASCII_CHARS, so the two strings must
  # always have the same length and the same ordering.
  #
  # U+0103..U+0105 (a-breve, A/a-ogonek) and U+0149 (n preceded by apostrophe)
  # are spelled as escapes on purpose: re-encoding or compatibility
  # normalization of this file turns them into other characters (U+0149 even
  # becomes two characters), which silently shifts every later mapping.
  NON_ASCII_CHARS = "ÀÁÂÃÄÅàáâãäåĀāĂ\u0103\u0104\u0105ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇň\u0149ŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž"

  # ASCII replacement for each character of NON_ASCII_CHARS, position by
  # position.
  EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'

  # Returns a copy of +str+ in which every character listed in
  # NON_ASCII_CHARS is replaced by its ASCII counterpart. Characters that are
  # not in the table (including plain ASCII) are returned unchanged.
  #
  # @param str [String] text to fold
  # @return [String] a new string; +str+ itself is not modified
  def fold(str)
    str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
  end
end
|
25
app/lib/hashtag_normalizer.rb
Normal file
25
app/lib/hashtag_normalizer.rb
Normal file
@@ -0,0 +1,25 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Reduces a hashtag string to a canonical form by running it through four
# steps in a fixed order: NFKC normalization, lowercasing, ASCII folding and
# removal of characters that are not allowed.
class HashtagNormalizer
  # @param str [String] raw hashtag text
  # @return [String] the normalized hashtag
  def normalize(str)
    width_normalized = cjk_width(str)
    lowercased       = lowercase(width_normalized)
    folded           = ascii_folding(lowercased)

    remove_invalid_characters(folded)
  end

  private

  # Applies Unicode NFKC normalization. Going by the method name, the intent
  # is to unify CJK full-width/half-width variants.
  def cjk_width(text)
    text.unicode_normalize(:nfkc)
  end

  # Lowercases through the multibyte-aware wrapper returned by +mb_chars+ and
  # converts the result back to a String.
  def lowercase(text)
    downcased = text.mb_chars.downcase
    downcased.to_s
  end

  # Replaces accented Latin letters with ASCII ones via ASCIIFolding.
  def ascii_folding(text)
    folder = ASCIIFolding.new
    folder.fold(text)
  end

  # Deletes every character that is neither alphanumeric nor one of
  # Tag::HASHTAG_SEPARATORS (defined outside this file).
  def remove_invalid_characters(text)
    disallowed = /[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/
    text.gsub(disallowed, '')
  end
end
|
Reference in New Issue
Block a user