Add more accurate hashtag search (#11579)
* Add more accurate hashtag search
  Uses ElasticSearch to index hashtags with edge n-grams and scores them by usage within the last 7 days since last activity. Only hashtags that have been reviewed and are listable can appear in searches, unless they match the query exactly.
* Fix search analyzer dropping non-ASCII characters
This commit is contained in:
		
							
								
								
									
										37
									
								
								app/chewy/tags_index.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								app/chewy/tags_index.rb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,37 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| # Chewy index definition for hashtags. | ||||
| # | ||||
| # Declares the analysis settings (analyzers and tokenizer) used for tag names | ||||
| # and the document mapping built from listable Tag records. | ||||
| class TagsIndex < Chewy::Index | ||||
|   # The index is refreshed every 15 minutes rather than on every write. | ||||
|   settings index: { refresh_interval: '15m' }, analysis: { | ||||
|     analyzer: { | ||||
|       # Uses the built-in `keyword` tokenizer, then lowercases, ASCII-folds | ||||
|       # and applies the cjk_width filter. | ||||
|       content: { | ||||
|         tokenizer: 'keyword', | ||||
|         filter: %w(lowercase asciifolding cjk_width), | ||||
|       }, | ||||
|  | ||||
|       # Same filter chain as `content`, but tokenized with the custom | ||||
|       # `edge_ngram` tokenizer declared below. | ||||
|       edge_ngram: { | ||||
|         tokenizer: 'edge_ngram', | ||||
|         filter: %w(lowercase asciifolding cjk_width), | ||||
|       }, | ||||
|     }, | ||||
|  | ||||
|     tokenizer: { | ||||
|       # Edge n-grams between 2 and 15 characters long. | ||||
|       edge_ngram: { | ||||
|         type: 'edge_ngram', | ||||
|         min_gram: 2, | ||||
|         max_gram: 15, | ||||
|       }, | ||||
|     }, | ||||
|   } | ||||
|  | ||||
|   # Only the `Tag.listable` scope is indexed. The `delete_if` lambda is true | ||||
|   # for tags that were destroyed or are no longer listable. | ||||
|   # NOTE(review): presumably Chewy removes the document when it returns true; confirm against the Chewy docs. | ||||
|   define_type ::Tag.listable, delete_if: ->(tag) { tag.destroyed? || !tag.listable? } do | ||||
|     root date_detection: false do | ||||
|       # `name` is analyzed with `content`; its `edge_ngram` sub-field is | ||||
|       # indexed with the `edge_ngram` analyzer but searched with `content`. | ||||
|       field :name, type: 'text', analyzer: 'content' do | ||||
|         field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' | ||||
|       end | ||||
|  | ||||
|       field :reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? } | ||||
|       # Sum of the `:accounts` values across every entry of `tag.history`. | ||||
|       field :usage, type: 'long', value: ->(tag) { tag.history.reduce(0) { |total, day| total + day[:accounts].to_i } } | ||||
|       # Falls back to the creation time when `last_status_at` is nil. | ||||
|       field :last_status_at, type: 'date', value: ->(tag) { tag.last_status_at || tag.created_at } | ||||
|     end | ||||
|   end | ||||
| end | ||||
| @@ -13,6 +13,8 @@ | ||||
| #  listable            :boolean | ||||
| #  reviewed_at         :datetime | ||||
| #  requested_review_at :datetime | ||||
| #  last_status_at      :datetime | ||||
| #  last_trend_at       :datetime | ||||
| # | ||||
|  | ||||
| class Tag < ApplicationRecord | ||||
| @@ -33,7 +35,8 @@ class Tag < ApplicationRecord | ||||
|   scope :unreviewed, -> { where(reviewed_at: nil) } | ||||
|   scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) } | ||||
|   scope :usable, -> { where(usable: [true, nil]) } | ||||
|   scope :discoverable, -> { where(listable: [true, nil]).joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) } | ||||
|   scope :listable, -> { where(listable: [true, nil]) } | ||||
|   scope :discoverable, -> { listable.joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) } | ||||
|   scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) } | ||||
|  | ||||
|   delegate :accounts_count, | ||||
| @@ -44,6 +47,8 @@ class Tag < ApplicationRecord | ||||
|  | ||||
|   after_save :save_account_tag_stat | ||||
|  | ||||
|   update_index('tags#tag', :self) if Chewy.enabled? | ||||
|  | ||||
|   def account_tag_stat | ||||
|     super || build_account_tag_stat | ||||
|   end | ||||
| @@ -121,9 +126,10 @@ class Tag < ApplicationRecord | ||||
|       normalized_term = normalize(term.strip).mb_chars.downcase.to_s | ||||
|       pattern         = sanitize_sql_like(normalized_term) + '%' | ||||
|  | ||||
|       Tag.where(arel_table[:name].lower.matches(pattern)) | ||||
|          .where(arel_table[:score].gt(0).or(arel_table[:name].lower.eq(normalized_term))) | ||||
|          .order(Arel.sql('length(name) ASC, score DESC, name ASC')) | ||||
|       Tag.listable | ||||
|          .where(arel_table[:name].lower.matches(pattern)) | ||||
|          .where(arel_table[:name].lower.eq(normalized_term).or(arel_table[:reviewed_at].not_eq(nil))) | ||||
|          .order(Arel.sql('length(name) ASC, name ASC')) | ||||
|          .limit(limit) | ||||
|          .offset(offset) | ||||
|     end | ||||
|   | ||||
| @@ -17,6 +17,9 @@ class TrendingTags | ||||
|       increment_historical_use!(tag.id, at_time) | ||||
|       increment_unique_use!(tag.id, account.id, at_time) | ||||
|       increment_vote!(tag, at_time) | ||||
|  | ||||
|       tag.update(last_status_at: Time.now.utc) if tag.last_status_at.nil? || tag.last_status_at < 12.hours.ago | ||||
|       tag.update(last_trend_at: Time.now.utc)  if trending?(tag) && (tag.last_trend_at.nil? || tag.last_trend_at < 12.hours.ago) | ||||
|     end | ||||
|  | ||||
|     def get(limit, filtered: true) | ||||
|   | ||||
| @@ -109,7 +109,7 @@ class AccountSearchService < BaseService | ||||
|       field_value_factor: { | ||||
|         field: 'followers_count', | ||||
|         modifier: 'log2p', | ||||
|         missing: 1, | ||||
|         missing: 0, | ||||
|       }, | ||||
|     } | ||||
|   end | ||||
|   | ||||
| @@ -57,10 +57,10 @@ class SearchService < BaseService | ||||
|   end | ||||
|  | ||||
|   def perform_hashtags_search! | ||||
|     Tag.search_for( | ||||
|       @query.gsub(/\A#/, ''), | ||||
|       @limit, | ||||
|       @offset | ||||
|     TagSearchService.new.call( | ||||
|       @query, | ||||
|       limit: @limit, | ||||
|       offset: @offset | ||||
|     ) | ||||
|   end | ||||
|  | ||||
|   | ||||
							
								
								
									
										82
									
								
								app/services/tag_search_service.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								app/services/tag_search_service.rb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,82 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| class TagSearchService < BaseService | ||||
|   def call(query, options = {}) | ||||
|     @query  = query.strip.gsub(/\A#/, '') | ||||
|     @offset = options[:offset].to_i | ||||
|     @limit  = options[:limit].to_i | ||||
|  | ||||
|     if Chewy.enabled? | ||||
|       from_elasticsearch | ||||
|     else | ||||
|       from_database | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   private | ||||
|  | ||||
|   def from_elasticsearch | ||||
|     query = { | ||||
|       function_score: { | ||||
|         query: { | ||||
|           multi_match: { | ||||
|             query: @query, | ||||
|             fields: %w(name.edge_ngram name), | ||||
|             type: 'most_fields', | ||||
|             operator: 'and', | ||||
|           }, | ||||
|         }, | ||||
|  | ||||
|         functions: [ | ||||
|           { | ||||
|             field_value_factor: { | ||||
|               field: 'usage', | ||||
|               modifier: 'log2p', | ||||
|               missing: 0, | ||||
|             }, | ||||
|           }, | ||||
|  | ||||
|           { | ||||
|             gauss: { | ||||
|               last_status_at: { | ||||
|                 scale: '7d', | ||||
|                 offset: '14d', | ||||
|                 decay: 0.5, | ||||
|               }, | ||||
|             }, | ||||
|           }, | ||||
|         ], | ||||
|  | ||||
|         boost_mode: 'multiply', | ||||
|       }, | ||||
|     } | ||||
|  | ||||
|     filter = { | ||||
|       bool: { | ||||
|         should: [ | ||||
|           { | ||||
|             term: { | ||||
|               reviewed: { | ||||
|                 value: true, | ||||
|               }, | ||||
|             }, | ||||
|           }, | ||||
|  | ||||
|           { | ||||
|             term: { | ||||
|               name: { | ||||
|                 value: @query, | ||||
|               }, | ||||
|             }, | ||||
|           }, | ||||
|         ], | ||||
|       }, | ||||
|     } | ||||
|  | ||||
|     TagsIndex.query(query).filter(filter).limit(@limit).offset(@offset).objects.compact | ||||
|   end | ||||
|  | ||||
|   def from_database | ||||
|     Tag.search_for(@query, @limit, @offset) | ||||
|   end | ||||
| end | ||||
| @@ -142,7 +142,7 @@ en: | ||||
|         report: Send e-mail when a new report is submitted | ||||
|         trending_tag: Send e-mail when an unreviewed hashtag is trending | ||||
|       tag: | ||||
|         listable: Allow this hashtag to appear on the profile directory | ||||
|         listable: Allow this hashtag to appear in searches and on the profile directory | ||||
|         trendable: Allow this hashtag to appear under trends | ||||
|         usable: Allow toots to use this hashtag | ||||
|     'no': 'No' | ||||
|   | ||||
							
								
								
									
										6
									
								
								db/migrate/20190815225426_add_last_status_at_to_tags.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								db/migrate/20190815225426_add_last_status_at_to_tags.rb
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| class AddLastStatusAtToTags < ActiveRecord::Migration[5.2] | ||||
|   def change | ||||
|     add_column :tags, :last_status_at, :datetime | ||||
|     add_column :tags, :last_trend_at, :datetime | ||||
|   end | ||||
| end | ||||
| @@ -10,7 +10,7 @@ | ||||
| # | ||||
| # It's strongly recommended that you check this file into your version control system. | ||||
|  | ||||
| ActiveRecord::Schema.define(version: 2019_08_07_135426) do | ||||
| ActiveRecord::Schema.define(version: 2019_08_15_225426) do | ||||
|  | ||||
|   # These are extensions that must be enabled in order to support this database | ||||
|   enable_extension "plpgsql" | ||||
| @@ -667,6 +667,8 @@ ActiveRecord::Schema.define(version: 2019_08_07_135426) do | ||||
|     t.boolean "listable" | ||||
|     t.datetime "reviewed_at" | ||||
|     t.datetime "requested_review_at" | ||||
|     t.datetime "last_status_at" | ||||
|     t.datetime "last_trend_at" | ||||
|     t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true | ||||
|   end | ||||
|  | ||||
|   | ||||
| @@ -136,8 +136,8 @@ RSpec.describe Tag, type: :model do | ||||
|     end | ||||
|  | ||||
|     it 'finds the exact matching tag as the first item' do | ||||
|       similar_tag = Fabricate(:tag, name: "matchlater", score: 1) | ||||
|       tag = Fabricate(:tag, name: "match", score: 1) | ||||
|       similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc) | ||||
|       tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc) | ||||
|  | ||||
|       results = Tag.search_for("match") | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user