Change spam check to apply to local accounts and add a threshold (#11806)
Instead of detecting spam on first duplicate message, add a threshold of 5 such messages to reduce false positives
This commit is contained in:
		| @@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity | ||||
|   end | ||||
|  | ||||
|   def check_for_spam | ||||
|     spam_check = SpamCheck.new(@status) | ||||
|  | ||||
|     return if spam_check.skip? | ||||
|  | ||||
|     if spam_check.spam? | ||||
|       spam_check.flag! | ||||
|     else | ||||
|       spam_check.remember! | ||||
|     end | ||||
|     SpamCheck.perform(@status) | ||||
|   end | ||||
|  | ||||
|   def forward_for_reply | ||||
|   | ||||
| @@ -4,10 +4,26 @@ class SpamCheck | ||||
|   include Redisable | ||||
|   include ActionView::Helpers::TextHelper | ||||
|  | ||||
|   # Threshold over which two Nilsimsa values are considered | ||||
|   # to refer to the same text | ||||
|   NILSIMSA_COMPARE_THRESHOLD = 95 | ||||
|  | ||||
|   # Nilsimsa doesn't work well on small inputs, so below | ||||
|   # this size, we check only for exact matches with MD5 | ||||
|   NILSIMSA_MIN_SIZE = 10 | ||||
|  | ||||
|   # How long to keep the trail of digests between updates, | ||||
|   # there is no reason to store it forever | ||||
|   EXPIRE_SET_AFTER = 1.week.seconds | ||||
|  | ||||
|   # How many digests to keep in an account's trail. If it's | ||||
|   # too small, spam could rotate around different message templates | ||||
|   MAX_TRAIL_SIZE = 10 | ||||
|  | ||||
|   # How many detected duplicates to allow through before | ||||
|   # considering the message as spam | ||||
|   THRESHOLD = 5 | ||||
|  | ||||
|   def initialize(status) | ||||
|     @account = status.account | ||||
|     @status  = status | ||||
| @@ -21,9 +37,9 @@ class SpamCheck | ||||
|     if insufficient_data? | ||||
|       false | ||||
|     elsif nilsimsa? | ||||
|       any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } | ||||
|       digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } | ||||
|     else | ||||
|       any_other_digest?('md5') { |_, other_digest| other_digest == digest } | ||||
|       digests_over_threshold?('md5') { |_, other_digest| other_digest == digest } | ||||
|     end | ||||
|   end | ||||
|  | ||||
| @@ -38,7 +54,7 @@ class SpamCheck | ||||
|     # get the correct status ID back, we have to save it in the string value | ||||
|  | ||||
|     redis.zadd(redis_key, @status.id, digest_with_algorithm) | ||||
|     redis.zremrangebyrank(redis_key, '0', '-10') | ||||
|     redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1)) | ||||
|     redis.expire(redis_key, EXPIRE_SET_AFTER) | ||||
|   end | ||||
|  | ||||
| @@ -78,6 +94,20 @@ class SpamCheck | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   class << self | ||||
|     def perform(status) | ||||
|       spam_check = new(status) | ||||
|  | ||||
|       return if spam_check.skip? | ||||
|  | ||||
|       if spam_check.spam? | ||||
|         spam_check.flag! | ||||
|       else | ||||
|         spam_check.remember! | ||||
|       end | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   private | ||||
|  | ||||
|   def disabled? | ||||
| @@ -149,14 +179,14 @@ class SpamCheck | ||||
|     redis.zrange(redis_key, 0, -1) | ||||
|   end | ||||
|  | ||||
|   def any_other_digest?(filter_algorithm) | ||||
|     other_digests.any? do |record| | ||||
|   def digests_over_threshold?(filter_algorithm) | ||||
|     other_digests.select do |record| | ||||
|       algorithm, other_digest, status_id = record.split(':') | ||||
|  | ||||
|       next unless algorithm == filter_algorithm | ||||
|  | ||||
|       yield algorithm, other_digest, status_id | ||||
|     end | ||||
|     end.size >= THRESHOLD | ||||
|   end | ||||
|  | ||||
|   def matching_status_ids | ||||
|   | ||||
| @@ -33,6 +33,7 @@ class ProcessMentionsService < BaseService | ||||
|     end | ||||
|  | ||||
|     status.save! | ||||
|     check_for_spam(status) | ||||
|  | ||||
|     mentions.each { |mention| create_notification(mention) } | ||||
|   end | ||||
| @@ -61,4 +62,8 @@ class ProcessMentionsService < BaseService | ||||
|   def resolve_account_service | ||||
|     ResolveAccountService.new | ||||
|   end | ||||
|  | ||||
|   def check_for_spam(status) | ||||
|     SpamCheck.perform(status) | ||||
|   end | ||||
| end | ||||
|   | ||||
| @@ -86,23 +86,33 @@ RSpec.describe SpamCheck do | ||||
|     end | ||||
|  | ||||
|     it 'returns true for duplicate statuses to the same recipient' do | ||||
|       described_class::THRESHOLD.times do | ||||
|         status1 = status_with_html('@alice Hello') | ||||
|         described_class.new(status1).remember! | ||||
|       end | ||||
|  | ||||
|       status2 = status_with_html('@alice Hello') | ||||
|       expect(described_class.new(status2).spam?).to be true | ||||
|     end | ||||
|  | ||||
|     it 'returns true for duplicate statuses to different recipients' do | ||||
|       described_class::THRESHOLD.times do | ||||
|         status1 = status_with_html('@alice Hello') | ||||
|         described_class.new(status1).remember! | ||||
|       end | ||||
|  | ||||
|       status2 = status_with_html('@bob Hello') | ||||
|       expect(described_class.new(status2).spam?).to be true | ||||
|     end | ||||
|  | ||||
|     it 'returns true for nearly identical statuses with random numbers' do | ||||
|       source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.' | ||||
|  | ||||
|       described_class::THRESHOLD.times do | ||||
|         status1 = status_with_html('@alice ' + source_text + ' 1234') | ||||
|         described_class.new(status1).remember! | ||||
|       end | ||||
|  | ||||
|       status2 = status_with_html('@bob ' + source_text + ' 9568') | ||||
|       expect(described_class.new(status2).spam?).to be true | ||||
|     end | ||||
| @@ -140,9 +150,9 @@ RSpec.describe SpamCheck do | ||||
|     let(:redis_key) { spam_check.send(:redis_key) } | ||||
|  | ||||
|     it 'remembers' do | ||||
|       expect do | ||||
|       expect(Redis.current.exists(redis_key)).to be true | ||||
|       spam_check.remember! | ||||
|       end.to change { Redis.current.exists(redis_key) }.from(false).to(true) | ||||
|       expect(Redis.current.exists(redis_key)).to be true | ||||
|     end | ||||
|   end | ||||
|  | ||||
| @@ -156,9 +166,9 @@ RSpec.describe SpamCheck do | ||||
|     end | ||||
|  | ||||
|     it 'resets' do | ||||
|       expect do | ||||
|       expect(Redis.current.exists(redis_key)).to be true | ||||
|       spam_check.reset! | ||||
|       end.to change { Redis.current.exists(redis_key) }.from(true).to(false) | ||||
|       expect(Redis.current.exists(redis_key)).to be false | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user