Change how hashtags are normalized (#18795)
* Change how hashtags are normalized * Fix tests
This commit is contained in:
		| @@ -16,6 +16,8 @@ module Admin | ||||
|       if @tag.update(tag_params.merge(reviewed_at: Time.now.utc)) | ||||
|         redirect_to admin_tag_path(@tag.id), notice: I18n.t('admin.tags.updated_msg') | ||||
|       else | ||||
|         @time_period = (6.days.ago.to_date...Time.now.utc.to_date) | ||||
|  | ||||
|         render :show | ||||
|       end | ||||
|     end | ||||
| @@ -27,7 +29,7 @@ module Admin | ||||
|     end | ||||
|  | ||||
|     def tag_params | ||||
|       params.require(:tag).permit(:name, :trendable, :usable, :listable) | ||||
|       params.require(:tag).permit(:name, :display_name, :trendable, :usable, :listable) | ||||
|     end | ||||
|   end | ||||
| end | ||||
|   | ||||
| @@ -13,9 +13,7 @@ class Api::V1::FeaturedTagsController < Api::BaseController | ||||
|   end | ||||
|  | ||||
|   def create | ||||
|     @featured_tag = current_account.featured_tags.new(featured_tag_params) | ||||
|     @featured_tag.reset_data | ||||
|     @featured_tag.save! | ||||
|     @featured_tag = current_account.featured_tags.create!(featured_tag_params) | ||||
|     render json: @featured_tag, serializer: REST::FeaturedTagSerializer | ||||
|   end | ||||
|  | ||||
|   | ||||
| @@ -11,7 +11,6 @@ class Settings::FeaturedTagsController < Settings::BaseController | ||||
|  | ||||
|   def create | ||||
|     @featured_tag = current_account.featured_tags.new(featured_tag_params) | ||||
|     @featured_tag.reset_data | ||||
|  | ||||
|     if @featured_tag.save | ||||
|       redirect_to settings_featured_tags_path | ||||
|   | ||||
| @@ -606,7 +606,20 @@ function insertIntoTagHistory(recognizedTags, text) { | ||||
|     const state = getState(); | ||||
|     const oldHistory = state.getIn(['compose', 'tagHistory']); | ||||
|     const me = state.getIn(['meta', 'me']); | ||||
|     const names = recognizedTags.map(tag => text.match(new RegExp(`#${tag.name}`, 'i'))[0].slice(1)); | ||||
|  | ||||
|     // FIXME: Matching input hashtags with recognized hashtags has become more | ||||
|     // complicated because of new normalization rules, it's no longer just | ||||
|     // a case sensitivity issue | ||||
|     const names = recognizedTags.map(tag => { | ||||
|       const matches = text.match(new RegExp(`#${tag.name}`, 'i')); | ||||
|  | ||||
|       if (matches && matches.length > 0) { | ||||
|         return matches[0].slice(1); | ||||
|       } else { | ||||
|         return tag.name; | ||||
|       } | ||||
|     }); | ||||
|  | ||||
|     const intersectedOldHistory = oldHistory.filter(name => names.findIndex(newName => newName.toLowerCase() === name.toLowerCase()) === -1); | ||||
|  | ||||
|     names.push(...intersectedOldHistory.toJS()); | ||||
|   | ||||
							
								
								
									
										10
									
								
								app/lib/ascii_folding.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								app/lib/ascii_folding.rb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| class ASCIIFolding | ||||
|   NON_ASCII_CHARS        = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž' | ||||
|   EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz' | ||||
|  | ||||
|   def fold(str) | ||||
|     str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS) | ||||
|   end | ||||
| end | ||||
							
								
								
									
										25
									
								
								app/lib/hashtag_normalizer.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								app/lib/hashtag_normalizer.rb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| class HashtagNormalizer | ||||
|   def normalize(str) | ||||
|     remove_invalid_characters(ascii_folding(lowercase(cjk_width(str)))) | ||||
|   end | ||||
|  | ||||
|   private | ||||
|  | ||||
|   def remove_invalid_characters(str) | ||||
|     str.gsub(/[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/, '') | ||||
|   end | ||||
|  | ||||
|   def ascii_folding(str) | ||||
|     ASCIIFolding.new.fold(str) | ||||
|   end | ||||
|  | ||||
|   def lowercase(str) | ||||
|     str.mb_chars.downcase.to_s | ||||
|   end | ||||
|  | ||||
|   def cjk_width(str) | ||||
|     str.unicode_normalize(:nfkc) | ||||
|   end | ||||
| end | ||||
| @@ -62,7 +62,7 @@ class Account < ApplicationRecord | ||||
|   ) | ||||
|  | ||||
|   USERNAME_RE   = /[a-z0-9_]+([a-z0-9_\.-]+[a-z0-9_]+)?/i | ||||
|   MENTION_RE    = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:word:]\.\-]+[[:word:]]+)?)/i | ||||
|   MENTION_RE    = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:alnum:]\.\-]+[[:alnum:]]+)?)/i | ||||
|   URL_PREFIX_RE = /\Ahttp(s?):\/\/[^\/]+/ | ||||
|  | ||||
|   include Attachmentable | ||||
|   | ||||
| @@ -3,14 +3,14 @@ | ||||
| # | ||||
| # Table name: custom_filters | ||||
| # | ||||
| #  id         :bigint           not null, primary key | ||||
| #  account_id :bigint | ||||
| #  id         :bigint(8)        not null, primary key | ||||
| #  account_id :bigint(8) | ||||
| #  expires_at :datetime | ||||
| #  phrase     :text             default(""), not null | ||||
| #  context    :string           default([]), not null, is an Array | ||||
| #  created_at :datetime         not null | ||||
| #  updated_at :datetime         not null | ||||
| #  action     :integer          default(0), not null | ||||
| #  action     :integer          default("warn"), not null | ||||
| # | ||||
|  | ||||
| class CustomFilter < ApplicationRecord | ||||
|   | ||||
| @@ -3,8 +3,8 @@ | ||||
| # | ||||
| # Table name: custom_filter_keywords | ||||
| # | ||||
| #  id               :bigint           not null, primary key | ||||
| #  custom_filter_id :bigint           not null | ||||
| #  id               :bigint(8)        not null, primary key | ||||
| #  custom_filter_id :bigint(8)        not null | ||||
| #  keyword          :text             default(""), not null | ||||
| #  whole_word       :boolean          default(TRUE), not null | ||||
| #  created_at       :datetime         not null | ||||
|   | ||||
| @@ -13,17 +13,19 @@ | ||||
| # | ||||
|  | ||||
| class FeaturedTag < ApplicationRecord | ||||
|   belongs_to :account, inverse_of: :featured_tags, required: true | ||||
|   belongs_to :tag, inverse_of: :featured_tags, required: true | ||||
|   belongs_to :account, inverse_of: :featured_tags | ||||
|   belongs_to :tag, inverse_of: :featured_tags, optional: true # Set after validation | ||||
|  | ||||
|   delegate :name, to: :tag, allow_nil: true | ||||
|  | ||||
|   validates_associated :tag, on: :create | ||||
|   validates :name, presence: true, on: :create | ||||
|   validate :validate_tag_name, on: :create | ||||
|   validate :validate_featured_tags_limit, on: :create | ||||
|  | ||||
|   def name=(str) | ||||
|     self.tag = Tag.find_or_create_by_names(str.strip)&.first | ||||
|   before_create :set_tag | ||||
|   before_create :reset_data | ||||
|  | ||||
|   attr_writer :name | ||||
|  | ||||
|   def name | ||||
|     tag_id.present? ? tag.name : @name | ||||
|   end | ||||
|  | ||||
|   def increment(timestamp) | ||||
| @@ -34,14 +36,23 @@ class FeaturedTag < ApplicationRecord | ||||
|     update(statuses_count: [0, statuses_count - 1].max, last_status_at: account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).where.not(id: deleted_status_id).select(:created_at).first&.created_at) | ||||
|   end | ||||
|  | ||||
|   private | ||||
|  | ||||
|   def set_tag | ||||
|     self.tag = Tag.find_or_create_by_names(@name)&.first | ||||
|   end | ||||
|  | ||||
|   def reset_data | ||||
|     self.statuses_count = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).count | ||||
|     self.last_status_at = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).select(:created_at).first&.created_at | ||||
|   end | ||||
|  | ||||
|   private | ||||
|  | ||||
|   def validate_featured_tags_limit | ||||
|     errors.add(:base, I18n.t('featured_tags.errors.limit')) if account.featured_tags.count >= 10 | ||||
|   end | ||||
|  | ||||
|   def validate_tag_name | ||||
|     errors.add(:name, :blank) if @name.blank? | ||||
|     errors.add(:name, :invalid) unless @name.match?(/\A(#{Tag::HASHTAG_NAME_RE})\z/i) | ||||
|   end | ||||
| end | ||||
|   | ||||
| @@ -15,6 +15,7 @@ | ||||
| #  last_status_at      :datetime | ||||
| #  max_score           :float | ||||
| #  max_score_at        :datetime | ||||
| #  display_name        :string | ||||
| # | ||||
|  | ||||
| class Tag < ApplicationRecord | ||||
| @@ -24,11 +25,12 @@ class Tag < ApplicationRecord | ||||
|   has_many :featured_tags, dependent: :destroy, inverse_of: :tag | ||||
|  | ||||
|   HASHTAG_SEPARATORS = "_\u00B7\u200c" | ||||
|   HASHTAG_NAME_RE    = "([[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_])|([[:word:]_]*[[:alpha:]][[:word:]_]*)" | ||||
|   HASHTAG_NAME_RE    = "([[:alnum:]_][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alnum:]_])|([[:alnum:]_]*[[:alpha:]][[:alnum:]_]*)" | ||||
|   HASHTAG_RE         = /(?:^|[^\/\)\w])#(#{HASHTAG_NAME_RE})/i | ||||
|  | ||||
|   validates :name, presence: true, format: { with: /\A(#{HASHTAG_NAME_RE})\z/i } | ||||
|   validate :validate_name_change, if: -> { !new_record? && name_changed? } | ||||
|   validate :validate_display_name_change, if: -> { !new_record? && display_name_changed? } | ||||
|  | ||||
|   scope :reviewed, -> { where.not(reviewed_at: nil) } | ||||
|   scope :unreviewed, -> { where(reviewed_at: nil) } | ||||
| @@ -46,6 +48,10 @@ class Tag < ApplicationRecord | ||||
|     name | ||||
|   end | ||||
|  | ||||
|   def display_name | ||||
|     attributes['display_name'] || name | ||||
|   end | ||||
|  | ||||
|   def usable | ||||
|     boolean_with_default('usable', true) | ||||
|   end | ||||
| @@ -90,8 +96,10 @@ class Tag < ApplicationRecord | ||||
|  | ||||
|   class << self | ||||
|     def find_or_create_by_names(name_or_names) | ||||
|       Array(name_or_names).map(&method(:normalize)).uniq { |str| str.mb_chars.downcase.to_s }.map do |normalized_name| | ||||
|         tag = matching_name(normalized_name).first || create(name: normalized_name) | ||||
|       names = Array(name_or_names).map { |str| [normalize(str), str] }.uniq(&:first) | ||||
|  | ||||
|       names.map do |(normalized_name, display_name)| | ||||
|         tag = matching_name(normalized_name).first || create(name: normalized_name, display_name: display_name) | ||||
|  | ||||
|         yield tag if block_given? | ||||
|  | ||||
| @@ -129,7 +137,7 @@ class Tag < ApplicationRecord | ||||
|     end | ||||
|  | ||||
|     def normalize(str) | ||||
|       str.gsub(/\A#/, '') | ||||
|       HashtagNormalizer.new.normalize(str) | ||||
|     end | ||||
|   end | ||||
|  | ||||
| @@ -138,4 +146,8 @@ class Tag < ApplicationRecord | ||||
|   def validate_name_change | ||||
|     errors.add(:name, I18n.t('tags.does_not_match_previous_name')) unless name_was.mb_chars.casecmp(name.mb_chars).zero? | ||||
|   end | ||||
|  | ||||
|   def validate_display_name_change | ||||
|     errors.add(:display_name, I18n.t('tags.does_not_match_previous_name')) unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero? | ||||
|   end | ||||
| end | ||||
|   | ||||
| @@ -10,11 +10,11 @@ class ActivityPub::HashtagSerializer < ActivityPub::Serializer | ||||
|   end | ||||
|  | ||||
|   def name | ||||
|     "##{object.name}" | ||||
|     "##{object.display_name}" | ||||
|   end | ||||
|  | ||||
|   def href | ||||
|     if object.class.name == 'FeaturedTag' | ||||
|     if object.instance_of?(FeaturedTag) | ||||
|       short_account_tag_url(object.account, object.tag) | ||||
|     else | ||||
|       tag_url(object) | ||||
|   | ||||
| @@ -12,4 +12,8 @@ class REST::FeaturedTagSerializer < ActiveModel::Serializer | ||||
|   def url | ||||
|     short_account_tag_url(object.account, object.tag) | ||||
|   end | ||||
|  | ||||
|   def name | ||||
|     object.display_name | ||||
|   end | ||||
| end | ||||
|   | ||||
| @@ -8,4 +8,8 @@ class REST::TagSerializer < ActiveModel::Serializer | ||||
|   def url | ||||
|     tag_url(object) | ||||
|   end | ||||
|  | ||||
|   def name | ||||
|     object.display_name | ||||
|   end | ||||
| end | ||||
|   | ||||
| @@ -75,7 +75,7 @@ | ||||
|           = link_to short_account_tag_path(@account, featured_tag.tag) do | ||||
|             %h4 | ||||
|               = fa_icon 'hashtag' | ||||
|               = featured_tag.name | ||||
|               = featured_tag.display_name | ||||
|               %small | ||||
|                 - if featured_tag.last_status_at.nil? | ||||
|                   = t('accounts.nothing_here') | ||||
|   | ||||
| @@ -28,7 +28,7 @@ RSS::Builder.build do |doc| | ||||
|       end | ||||
|  | ||||
|       status.tags.each do |tag| | ||||
|         item.category(tag.name) | ||||
|         item.category(tag.display_name) | ||||
|       end | ||||
|     end | ||||
|   end | ||||
|   | ||||
| @@ -2,7 +2,7 @@ | ||||
|   = javascript_pack_tag 'admin', async: true, crossorigin: 'anonymous' | ||||
|  | ||||
| - content_for :page_title do | ||||
|   = "##{@tag.name}" | ||||
|   = "##{@tag.display_name}" | ||||
|  | ||||
| - if current_user.can?(:view_dashboard) | ||||
|   - content_for :heading_actions do | ||||
| @@ -53,7 +53,7 @@ | ||||
|   = render 'shared/error_messages', object: @tag | ||||
|  | ||||
|   .fields-group | ||||
|     = f.input :name, wrapper: :with_block_label | ||||
|     = f.input :display_name, wrapper: :with_block_label | ||||
|  | ||||
|   .fields-group | ||||
|     = f.input :usable, as: :boolean, wrapper: :with_label | ||||
|   | ||||
| @@ -6,7 +6,7 @@ | ||||
|     .pending-account__header | ||||
|       = link_to admin_tag_path(tag.id) do | ||||
|         = fa_icon 'hashtag' | ||||
|         = tag.name | ||||
|         = tag.display_name | ||||
|  | ||||
|       %br/ | ||||
|  | ||||
|   | ||||
| @@ -1,12 +1,12 @@ | ||||
| <%= raw t('admin_mailer.new_trends.new_trending_tags.title') %> | ||||
|  | ||||
| <% @tags.each do |tag| %> | ||||
| - #<%= tag.name %> | ||||
| - #<%= tag.display_name %> | ||||
|   <%= raw t('admin.trends.tags.usage_comparison', today: tag.history.get(Time.now.utc).accounts, yesterday: tag.history.get(Time.now.utc - 1.day).accounts) %> • <%= t('admin.trends.tags.current_score', score: Trends.tags.score(tag.id).round(2)) %> | ||||
| <% end %> | ||||
|  | ||||
| <% if @lowest_trending_tag %> | ||||
| <%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %> | ||||
| <%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.display_name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %> | ||||
| <% else %> | ||||
| <%= raw t('admin_mailer.new_trends.new_trending_tags.no_approved_tags') %> | ||||
| <% end %> | ||||
|   | ||||
| @@ -9,7 +9,7 @@ | ||||
|   = render 'shared/error_messages', object: @featured_tag | ||||
|  | ||||
|   .fields-group | ||||
|     = f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ') | ||||
|     = f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.display_name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ') | ||||
|  | ||||
|   .actions | ||||
|     = f.button :button, t('featured_tags.add_new'), type: :submit | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| = opengraph 'og:site_name', t('about.hosted_on', domain: site_hostname) | ||||
| = opengraph 'og:url', tag_url(@tag) | ||||
| = opengraph 'og:type', 'website' | ||||
| = opengraph 'og:title', "##{@tag.name}" | ||||
| = opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.name)) | ||||
| = opengraph 'og:title', "##{@tag.display_name}" | ||||
| = opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.display_name)) | ||||
| = opengraph 'twitter:card', 'summary' | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
| - content_for :page_title do | ||||
|   = "##{@tag.name}" | ||||
|   = "##{@tag.display_name}" | ||||
|  | ||||
| - content_for :header_tags do | ||||
|   %meta{ name: 'robots', content: 'noindex' }/ | ||||
| @@ -9,8 +9,8 @@ | ||||
|   = render 'og' | ||||
|  | ||||
| .page-header | ||||
|   %h1= "##{@tag.name}" | ||||
|   %p= t('about.about_hashtag_html', hashtag: @tag.name) | ||||
|   %h1= "##{@tag.display_name}" | ||||
|   %p= t('about.about_hashtag_html', hashtag: @tag.display_name) | ||||
|  | ||||
| #mastodon-timeline{ data: { props: Oj.dump(default_props.merge(hashtag: @tag.name, local: @local)) }} | ||||
| .notranslate#modal-container | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| RSS::Builder.build do |doc| | ||||
|   doc.title("##{@tag.name}") | ||||
|   doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.name)) | ||||
|   doc.title("##{@tag.display_name}") | ||||
|   doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.display_name)) | ||||
|   doc.link(tag_url(@tag)) | ||||
|   doc.last_build_date(@statuses.first.created_at) if @statuses.any? | ||||
|   doc.generator("Mastodon v#{Mastodon::Version.to_s}") | ||||
| @@ -26,7 +26,7 @@ RSS::Builder.build do |doc| | ||||
|       end | ||||
|  | ||||
|       status.tags.each do |tag| | ||||
|         item.category(tag.name) | ||||
|         item.category(tag.display_name) | ||||
|       end | ||||
|     end | ||||
|   end | ||||
|   | ||||
| @@ -24,6 +24,7 @@ ActiveSupport::Inflector.inflections(:en) do |inflect| | ||||
|   inflect.acronym 'RSS' | ||||
|   inflect.acronym 'REST' | ||||
|   inflect.acronym 'URL' | ||||
|   inflect.acronym 'ASCII' | ||||
|  | ||||
|   inflect.singular 'data', 'data' | ||||
| end | ||||
|   | ||||
							
								
								
									
										5
									
								
								db/migrate/20220710102457_add_display_name_to_tags.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								db/migrate/20220710102457_add_display_name_to_tags.rb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,5 @@ | ||||
| class AddDisplayNameToTags < ActiveRecord::Migration[6.1] | ||||
|   def change | ||||
|     add_column :tags, :display_name, :string | ||||
|   end | ||||
| end | ||||
| @@ -10,7 +10,7 @@ | ||||
| # | ||||
| # It's strongly recommended that you check this file into your version control system. | ||||
|  | ||||
| ActiveRecord::Schema.define(version: 2022_07_04_024901) do | ||||
| ActiveRecord::Schema.define(version: 2022_07_10_102457) do | ||||
|  | ||||
|   # These are extensions that must be enabled in order to support this database | ||||
|   enable_extension "plpgsql" | ||||
| @@ -940,6 +940,7 @@ ActiveRecord::Schema.define(version: 2022_07_04_024901) do | ||||
|     t.datetime "last_status_at" | ||||
|     t.float "max_score" | ||||
|     t.datetime "max_score_at" | ||||
|     t.string "display_name" | ||||
|     t.index "lower((name)::text) text_pattern_ops", name: "index_tags_on_name_lower_btree", unique: true | ||||
|   end | ||||
|  | ||||
|   | ||||
							
								
								
									
										29
									
								
								spec/lib/hashtag_normalizer_spec.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								spec/lib/hashtag_normalizer_spec.rb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| require 'rails_helper' | ||||
|  | ||||
| describe HashtagNormalizer do | ||||
|   subject { described_class.new } | ||||
|  | ||||
|   describe '#normalize' do | ||||
|     it 'converts full-width Latin characters into basic Latin characters' do | ||||
|       expect(subject.normalize('Synthwave')).to eq 'synthwave' | ||||
|     end | ||||
|  | ||||
|     it 'converts half-width Katakana into Kana characters' do | ||||
|       expect(subject.normalize('シーサイドライナー')).to eq 'シーサイドライナー' | ||||
|     end | ||||
|  | ||||
|     it 'converts modified Latin characters into basic Latin characters' do | ||||
|       expect(subject.normalize('BLÅHAJ')).to eq 'blahaj' | ||||
|     end | ||||
|  | ||||
|     it 'strips out invalid characters' do | ||||
|       expect(subject.normalize('#foo')).to eq 'foo' | ||||
|     end | ||||
|  | ||||
|     it 'keeps valid characters' do | ||||
|       expect(subject.normalize('a·b')).to eq 'a·b' | ||||
|     end | ||||
|   end | ||||
| end | ||||
| @@ -91,7 +91,7 @@ RSpec.describe Tag, type: :model do | ||||
|       upcase_string   = 'abcABCabcABCやゆよ' | ||||
|       downcase_string = 'abcabcabcabcやゆよ'; | ||||
|  | ||||
|       tag = Fabricate(:tag, name: downcase_string) | ||||
|       tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) | ||||
|       expect(Tag.find_normalized(upcase_string)).to eq tag | ||||
|     end | ||||
|   end | ||||
| @@ -101,12 +101,12 @@ RSpec.describe Tag, type: :model do | ||||
|       upcase_string   = 'abcABCabcABCやゆよ' | ||||
|       downcase_string = 'abcabcabcabcやゆよ'; | ||||
|  | ||||
|       tag = Fabricate(:tag, name: downcase_string) | ||||
|       tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) | ||||
|       expect(Tag.matches_name(upcase_string)).to eq [tag] | ||||
|     end | ||||
|  | ||||
|     it 'uses the LIKE operator' do | ||||
|       expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100\\%abc%')] | ||||
|       expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')] | ||||
|     end | ||||
|   end | ||||
|  | ||||
| @@ -115,7 +115,7 @@ RSpec.describe Tag, type: :model do | ||||
|       upcase_string   = 'abcABCabcABCやゆよ' | ||||
|       downcase_string = 'abcabcabcabcやゆよ'; | ||||
|  | ||||
|       tag = Fabricate(:tag, name: downcase_string) | ||||
|       tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string)) | ||||
|       expect(Tag.matching_name(upcase_string)).to eq [tag] | ||||
|     end | ||||
|   end | ||||
|   | ||||
| @@ -892,6 +892,34 @@ const startWorker = async (workerId) => { | ||||
|     return arr; | ||||
|   }; | ||||
|  | ||||
|   /** | ||||
|    * See app/lib/ascii_folder.rb for the canon definitions | ||||
|    * of these constants | ||||
|    */ | ||||
|   const NON_ASCII_CHARS        = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'; | ||||
|   const EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'; | ||||
|  | ||||
|   /** | ||||
|    * @param {string} str | ||||
|    * @return {string} | ||||
|    */ | ||||
|   const foldToASCII = str => { | ||||
|     const regex = new RegExp(NON_ASCII_CHARS.split('').join('|'), 'g'); | ||||
|  | ||||
|     return str.replace(regex, match => { | ||||
|       const index = NON_ASCII_CHARS.indexOf(match); | ||||
|       return EQUIVALENT_ASCII_CHARS[index]; | ||||
|     }); | ||||
|   }; | ||||
|  | ||||
|   /** | ||||
|    * @param {string} str | ||||
|    * @return {string} | ||||
|    */ | ||||
|   const normalizeHashtag = str => { | ||||
|     return foldToASCII(str.normalize('NFKC').toLowerCase()).replace(/[^\p{L}\p{N}_\u00b7\u200c]/gu, ''); | ||||
|   }; | ||||
|  | ||||
|   /** | ||||
|    * @param {any} req | ||||
|    * @param {string} name | ||||
| @@ -968,7 +996,7 @@ const startWorker = async (workerId) => { | ||||
|         reject('No tag for stream provided'); | ||||
|       } else { | ||||
|         resolve({ | ||||
|           channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}`], | ||||
|           channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}`], | ||||
|           options: { needsFiltering: true }, | ||||
|         }); | ||||
|       } | ||||
| @@ -979,7 +1007,7 @@ const startWorker = async (workerId) => { | ||||
|         reject('No tag for stream provided'); | ||||
|       } else { | ||||
|         resolve({ | ||||
|           channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}:local`], | ||||
|           channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}:local`], | ||||
|           options: { needsFiltering: true }, | ||||
|         }); | ||||
|       } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user