Remove language detection through cld3 (#17478)
* Remove language detection through cld3
* Update app/helpers/languages_helper.rb

Co-authored-by: Yamagishi Kazutoshi <ykzts@desire.sh>
Co-authored-by: Yamagishi Kazutoshi <ykzts@desire.sh>
This commit is contained in:
		
							
								
								
									
										2
									
								
								Gemfile
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								Gemfile
									
									
									
									
									
								
							| @@ -29,9 +29,7 @@ gem 'addressable', '~> 2.8' | ||||
| gem 'bootsnap', '~> 1.10.2', require: false | ||||
| gem 'browser' | ||||
| gem 'charlock_holmes', '~> 0.7.7' | ||||
| gem 'iso-639' | ||||
| gem 'chewy', '~> 7.2' | ||||
| gem 'cld3', '~> 3.4.4' | ||||
| gem 'devise', '~> 4.8' | ||||
| gem 'devise-two-factor', '~> 4.0' | ||||
|  | ||||
|   | ||||
| @@ -152,8 +152,6 @@ GEM | ||||
|       elasticsearch (>= 7.12.0, < 7.14.0) | ||||
|       elasticsearch-dsl | ||||
|     chunky_png (1.4.0) | ||||
|     cld3 (3.4.4) | ||||
|       ffi (>= 1.1.0, < 1.16.0) | ||||
|     climate_control (0.2.0) | ||||
|     coderay (1.1.3) | ||||
|     color_diff (0.1) | ||||
| @@ -301,7 +299,6 @@ GEM | ||||
|       terminal-table (>= 1.5.1) | ||||
|     idn-ruby (0.1.4) | ||||
|     ipaddress (0.8.3) | ||||
|     iso-639 (0.3.5) | ||||
|     jmespath (1.5.0) | ||||
|     json (2.5.1) | ||||
|     json-canonicalization (0.3.0) | ||||
| @@ -698,7 +695,6 @@ DEPENDENCIES | ||||
|   capybara (~> 3.36) | ||||
|   charlock_holmes (~> 0.7.7) | ||||
|   chewy (~> 7.2) | ||||
|   cld3 (~> 3.4.4) | ||||
|   climate_control (~> 0.2) | ||||
|   color_diff (~> 0.1) | ||||
|   concurrent-ruby | ||||
| @@ -725,7 +721,6 @@ DEPENDENCIES | ||||
|   httplog (~> 1.5.0) | ||||
|   i18n-tasks (~> 0.9) | ||||
|   idn-ruby | ||||
|   iso-639 | ||||
|   json-ld | ||||
|   json-ld-preloaded (~> 3.2) | ||||
|   kaminari (~> 1.2) | ||||
|   | ||||
| @@ -1,94 +1,237 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| module LanguagesHelper | ||||
|   HUMAN_LOCALES = { | ||||
|     af: 'Afrikaans', | ||||
|     ar: 'العربية', | ||||
|     ast: 'Asturianu', | ||||
|     bg: 'Български', | ||||
|     bn: 'বাংলা', | ||||
|     br: 'Breton', | ||||
|     ca: 'Català', | ||||
|     co: 'Corsu', | ||||
|     cs: 'Čeština', | ||||
|     cy: 'Cymraeg', | ||||
|     da: 'Dansk', | ||||
|     de: 'Deutsch', | ||||
|     el: 'Ελληνικά', | ||||
|     en: 'English', | ||||
|     eo: 'Esperanto', | ||||
|   ISO_639_1 = { | ||||
|     aa: ['Afar', 'Afaraf'].freeze, | ||||
|     ab: ['Abkhaz', 'аҧсуа бызшәа'].freeze, | ||||
|     ae: ['Avestan', 'avesta'].freeze, | ||||
|     af: ['Afrikaans', 'Afrikaans'].freeze, | ||||
|     ak: ['Akan', 'Akan'].freeze, | ||||
|     am: ['Amharic', 'አማርኛ'].freeze, | ||||
|     an: ['Aragonese', 'aragonés'].freeze, | ||||
|     ar: ['Arabic', 'اللغة العربية'].freeze, | ||||
|     as: ['Assamese', 'অসমীয়া'].freeze, | ||||
|     av: ['Avaric', 'авар мацӀ'].freeze, | ||||
|     ay: ['Aymara', 'aymar aru'].freeze, | ||||
|     az: ['Azerbaijani', 'azərbaycan dili'].freeze, | ||||
|     ba: ['Bashkir', 'башҡорт теле'].freeze, | ||||
|     be: ['Belarusian', 'беларуская мова'].freeze, | ||||
|     bg: ['Bulgarian', 'български език'].freeze, | ||||
|     bh: ['Bihari', 'भोजपुरी'].freeze, | ||||
|     bi: ['Bislama', 'Bislama'].freeze, | ||||
|     bm: ['Bambara', 'bamanankan'].freeze, | ||||
|     bn: ['Bengali', 'বাংলা'].freeze, | ||||
|     bo: ['Tibetan', 'བོད་ཡིག'].freeze, | ||||
|     br: ['Breton', 'brezhoneg'].freeze, | ||||
|     bs: ['Bosnian', 'bosanski jezik'].freeze, | ||||
|     ca: ['Catalan', 'Català'].freeze, | ||||
|     ce: ['Chechen', 'нохчийн мотт'].freeze, | ||||
|     ch: ['Chamorro', 'Chamoru'].freeze, | ||||
|     co: ['Corsican', 'corsu'].freeze, | ||||
|     cr: ['Cree', 'ᓀᐦᐃᔭᐍᐏᐣ'].freeze, | ||||
|     cs: ['Czech', 'čeština'].freeze, | ||||
|     cu: ['Old Church Slavonic', 'ѩзыкъ словѣньскъ'].freeze, | ||||
|     cv: ['Chuvash', 'чӑваш чӗлхи'].freeze, | ||||
|     cy: ['Welsh', 'Cymraeg'].freeze, | ||||
|     da: ['Danish', 'dansk'].freeze, | ||||
|     de: ['German', 'Deutsch'].freeze, | ||||
|     dv: ['Divehi', 'Dhivehi'].freeze, | ||||
|     dz: ['Dzongkha', 'རྫོང་ཁ'].freeze, | ||||
|     ee: ['Ewe', 'Eʋegbe'].freeze, | ||||
|     el: ['Greek', 'Ελληνικά'].freeze, | ||||
|     en: ['English', 'English'].freeze, | ||||
|     eo: ['Esperanto', 'Esperanto'].freeze, | ||||
|     es: ['Spanish', 'Español'].freeze, | ||||
|     et: ['Estonian', 'eesti'].freeze, | ||||
|     eu: ['Basque', 'euskara'].freeze, | ||||
|     fa: ['Persian', 'فارسی'].freeze, | ||||
|     ff: ['Fula', 'Fulfulde'].freeze, | ||||
|     fi: ['Finnish', 'suomi'].freeze, | ||||
|     fj: ['Fijian', 'Vakaviti'].freeze, | ||||
|     fo: ['Faroese', 'føroyskt'].freeze, | ||||
|     fr: ['French', 'Français'].freeze, | ||||
|     fy: ['Western Frisian', 'Frysk'].freeze, | ||||
|     ga: ['Irish', 'Gaeilge'].freeze, | ||||
|     gd: ['Scottish Gaelic', 'Gàidhlig'].freeze, | ||||
|     gl: ['Galician', 'galego'].freeze, | ||||
|     gu: ['Gujarati', 'ગુજરાતી'].freeze, | ||||
|     gv: ['Manx', 'Gaelg'].freeze, | ||||
|     ha: ['Hausa', 'هَوُسَ'].freeze, | ||||
|     he: ['Hebrew', 'עברית'].freeze, | ||||
|     hi: ['Hindi', 'हिन्दी'].freeze, | ||||
|     ho: ['Hiri Motu', 'Hiri Motu'].freeze, | ||||
|     hr: ['Croatian', 'Hrvatski'].freeze, | ||||
|     ht: ['Haitian', 'Kreyòl ayisyen'].freeze, | ||||
|     hu: ['Hungarian', 'magyar'].freeze, | ||||
|     hy: ['Armenian', 'Հայերեն'].freeze, | ||||
|     hz: ['Herero', 'Otjiherero'].freeze, | ||||
|     ia: ['Interlingua', 'Interlingua'].freeze, | ||||
|     id: ['Indonesian', 'Bahasa Indonesia'].freeze, | ||||
|     ie: ['Interlingue', 'Interlingue'].freeze, | ||||
|     ig: ['Igbo', 'Asụsụ Igbo'].freeze, | ||||
|     ii: ['Nuosu', 'ꆈꌠ꒿ Nuosuhxop'].freeze, | ||||
|     ik: ['Inupiaq', 'Iñupiaq'].freeze, | ||||
|     io: ['Ido', 'Ido'].freeze, | ||||
|     is: ['Icelandic', 'Íslenska'].freeze, | ||||
|     it: ['Italian', 'Italiano'].freeze, | ||||
|     iu: ['Inuktitut', 'ᐃᓄᒃᑎᑐᑦ'].freeze, | ||||
|     ja: ['Japanese', '日本語'].freeze, | ||||
|     jv: ['Javanese', 'basa Jawa'].freeze, | ||||
|     ka: ['Georgian', 'ქართული'].freeze, | ||||
|     kg: ['Kongo', 'Kikongo'].freeze, | ||||
|     ki: ['Kikuyu', 'Gĩkũyũ'].freeze, | ||||
|     kj: ['Kwanyama', 'Kuanyama'].freeze, | ||||
|     kk: ['Kazakh', 'қазақ тілі'].freeze, | ||||
|     kl: ['Kalaallisut', 'kalaallisut'].freeze, | ||||
|     km: ['Khmer', 'ខេមរភាសា'].freeze, | ||||
|     kn: ['Kannada', 'ಕನ್ನಡ'].freeze, | ||||
|     ko: ['Korean', '한국어'].freeze, | ||||
|     kr: ['Kanuri', 'Kanuri'].freeze, | ||||
|     ks: ['Kashmiri', 'कश्मीरी'].freeze, | ||||
|     ku: ['Kurdish', 'Kurdî'].freeze, | ||||
|     kv: ['Komi', 'коми кыв'].freeze, | ||||
|     kw: ['Cornish', 'Kernewek'].freeze, | ||||
|     ky: ['Kyrgyz', 'Кыргызча'].freeze, | ||||
|     la: ['Latin', 'latine'].freeze, | ||||
|     lb: ['Luxembourgish', 'Lëtzebuergesch'].freeze, | ||||
|     lg: ['Ganda', 'Luganda'].freeze, | ||||
|     li: ['Limburgish', 'Limburgs'].freeze, | ||||
|     ln: ['Lingala', 'Lingála'].freeze, | ||||
|     lo: ['Lao', 'ພາສາ'].freeze, | ||||
|     lt: ['Lithuanian', 'lietuvių kalba'].freeze, | ||||
|     lu: ['Luba-Katanga', 'Tshiluba'].freeze, | ||||
|     lv: ['Latvian', 'latviešu valoda'].freeze, | ||||
|     mg: ['Malagasy', 'fiteny malagasy'].freeze, | ||||
|     mh: ['Marshallese', 'Kajin M̧ajeļ'].freeze, | ||||
|     mi: ['Māori', 'te reo Māori'].freeze, | ||||
|     mk: ['Macedonian', 'македонски јазик'].freeze, | ||||
|     ml: ['Malayalam', 'മലയാളം'].freeze, | ||||
|     mn: ['Mongolian', 'Монгол хэл'].freeze, | ||||
|     mr: ['Marathi', 'मराठी'].freeze, | ||||
|     ms: ['Malay', 'Bahasa Malaysia'].freeze, | ||||
|     mt: ['Maltese', 'Malti'].freeze, | ||||
|     my: ['Burmese', 'ဗမာစာ'].freeze, | ||||
|     na: ['Nauru', 'Ekakairũ Naoero'].freeze, | ||||
|     nb: ['Norwegian Bokmål', 'Norsk bokmål'].freeze, | ||||
|     nd: ['Northern Ndebele', 'isiNdebele'].freeze, | ||||
|     ne: ['Nepali', 'नेपाली'].freeze, | ||||
|     ng: ['Ndonga', 'Owambo'].freeze, | ||||
|     nl: ['Dutch', 'Nederlands'].freeze, | ||||
|     nn: ['Norwegian Nynorsk', 'Norsk nynorsk'].freeze, | ||||
|     no: ['Norwegian', 'Norsk'].freeze, | ||||
|     nr: ['Southern Ndebele', 'isiNdebele'].freeze, | ||||
|     nv: ['Navajo', 'Diné bizaad'].freeze, | ||||
|     ny: ['Chichewa', 'chiCheŵa'].freeze, | ||||
|     oc: ['Occitan', 'occitan'].freeze, | ||||
|     oj: ['Ojibwe', 'ᐊᓂᔑᓈᐯᒧᐎᓐ'].freeze, | ||||
|     om: ['Oromo', 'Afaan Oromoo'].freeze, | ||||
|     or: ['Oriya', 'ଓଡ଼ିଆ'].freeze, | ||||
|     os: ['Ossetian', 'ирон æвзаг'].freeze, | ||||
|     pa: ['Panjabi', 'ਪੰਜਾਬੀ'].freeze, | ||||
|     pi: ['Pāli', 'पाऴि'].freeze, | ||||
|     pl: ['Polish', 'Polski'].freeze, | ||||
|     ps: ['Pashto', 'پښتو'].freeze, | ||||
|     pt: ['Portuguese', 'Português'].freeze, | ||||
|     qu: ['Quechua', 'Runa Simi'].freeze, | ||||
|     rm: ['Romansh', 'rumantsch grischun'].freeze, | ||||
|     rn: ['Kirundi', 'Ikirundi'].freeze, | ||||
|     ro: ['Romanian', 'Română'].freeze, | ||||
|     ru: ['Russian', 'Русский'].freeze, | ||||
|     rw: ['Kinyarwanda', 'Ikinyarwanda'].freeze, | ||||
|     sa: ['Sanskrit', 'संस्कृतम्'].freeze, | ||||
|     sc: ['Sardinian', 'sardu'].freeze, | ||||
|     sd: ['Sindhi', 'सिन्धी'].freeze, | ||||
|     se: ['Northern Sami', 'Davvisámegiella'].freeze, | ||||
|     sg: ['Sango', 'yângâ tî sängö'].freeze, | ||||
|     si: ['Sinhala', 'සිංහල'].freeze, | ||||
|     sk: ['Slovak', 'slovenčina'].freeze, | ||||
|     sl: ['Slovenian', 'slovenščina'].freeze, | ||||
|     sn: ['Shona', 'chiShona'].freeze, | ||||
|     so: ['Somali', 'Soomaaliga'].freeze, | ||||
|     sq: ['Albanian', 'Shqip'].freeze, | ||||
|     sr: ['Serbian', 'српски језик'].freeze, | ||||
|     ss: ['Swati', 'SiSwati'].freeze, | ||||
|     st: ['Southern Sotho', 'Sesotho'].freeze, | ||||
|     su: ['Sundanese', 'Basa Sunda'].freeze, | ||||
|     sv: ['Swedish', 'Svenska'].freeze, | ||||
|     sw: ['Swahili', 'Kiswahili'].freeze, | ||||
|     ta: ['Tamil', 'தமிழ்'].freeze, | ||||
|     te: ['Telugu', 'తెలుగు'].freeze, | ||||
|     tg: ['Tajik', 'тоҷикӣ'].freeze, | ||||
|     th: ['Thai', 'ไทย'].freeze, | ||||
|     ti: ['Tigrinya', 'ትግርኛ'].freeze, | ||||
|     tk: ['Turkmen', 'Türkmen'].freeze, | ||||
|     tl: ['Tagalog', 'Wikang Tagalog'].freeze, | ||||
|     tn: ['Tswana', 'Setswana'].freeze, | ||||
|     to: ['Tonga', 'faka Tonga'].freeze, | ||||
|     tr: ['Turkish', 'Türkçe'].freeze, | ||||
|     ts: ['Tsonga', 'Xitsonga'].freeze, | ||||
|     tt: ['Tatar', 'татар теле'].freeze, | ||||
|     tw: ['Twi', 'Twi'].freeze, | ||||
|     ty: ['Tahitian', 'Reo Tahiti'].freeze, | ||||
|     ug: ['Uyghur', 'ئۇيغۇرچە'].freeze, | ||||
|     uk: ['Ukrainian', 'Українська'].freeze, | ||||
|     ur: ['Urdu', 'اردو'].freeze, | ||||
|     uz: ['Uzbek', 'Ўзбек'].freeze, | ||||
|     ve: ['Venda', 'Tshivenḓa'].freeze, | ||||
|     vi: ['Vietnamese', 'Tiếng Việt'].freeze, | ||||
|     vo: ['Volapük', 'Volapük'].freeze, | ||||
|     wa: ['Walloon', 'walon'].freeze, | ||||
|     wo: ['Wolof', 'Wollof'].freeze, | ||||
|     xh: ['Xhosa', 'isiXhosa'].freeze, | ||||
|     yi: ['Yiddish', 'ייִדיש'].freeze, | ||||
|     yo: ['Yoruba', 'Yorùbá'].freeze, | ||||
|     za: ['Zhuang', 'Saɯ cueŋƅ'].freeze, | ||||
|     zh: ['Chinese', '中文'].freeze, | ||||
|     zu: ['Zulu', 'isiZulu'].freeze, | ||||
|   }.freeze | ||||
|  | ||||
|   ISO_639_3 = { | ||||
|     ast: ['Asturian', 'Asturianu'].freeze, | ||||
|     kab: ['Kabyle', 'Taqbaylit'].freeze, | ||||
|     kmr: ['Northern Kurdish', 'Kurmancî'].freeze, | ||||
|     zgh: ['Standard Moroccan Tamazight', 'ⵜⴰⵎⴰⵣⵉⵖⵜ'].freeze, | ||||
|   }.freeze | ||||
|  | ||||
|   SUPPORTED_LOCALES = {}.merge(ISO_639_1).merge(ISO_639_3).freeze | ||||
|  | ||||
|   # For ISO-639-1 and ISO-639-3 language codes, we have their official | ||||
|   # names, but for some translations, we need the names of the | ||||
|   # regional variants specifically | ||||
|   REGIONAL_LOCALE_NAMES = { | ||||
|     'es-AR': 'Español (Argentina)', | ||||
|     'es-MX': 'Español (México)', | ||||
|     es: 'Español', | ||||
|     et: 'Eesti', | ||||
|     eu: 'Euskara', | ||||
|     fa: 'فارسی', | ||||
|     fi: 'Suomi', | ||||
|     fr: 'Français', | ||||
|     ga: 'Gaeilge', | ||||
|     gd: 'Gàidhlig', | ||||
|     gl: 'Galego', | ||||
|     he: 'עברית', | ||||
|     hi: 'हिन्दी', | ||||
|     hr: 'Hrvatski', | ||||
|     hu: 'Magyar', | ||||
|     hy: 'Հայերեն', | ||||
|     id: 'Bahasa Indonesia', | ||||
|     io: 'Ido', | ||||
|     is: 'Íslenska', | ||||
|     it: 'Italiano', | ||||
|     ja: '日本語', | ||||
|     ka: 'ქართული', | ||||
|     kab: 'Taqbaylit', | ||||
|     kk: 'Қазақша', | ||||
|     kmr: 'Kurmancî', | ||||
|     kn: 'ಕನ್ನಡ', | ||||
|     ko: '한국어', | ||||
|     ku: 'سۆرانی', | ||||
|     lt: 'Lietuvių', | ||||
|     lv: 'Latviešu', | ||||
|     mk: 'Македонски', | ||||
|     ml: 'മലയാളം', | ||||
|     mr: 'मराठी', | ||||
|     ms: 'Bahasa Melayu', | ||||
|     nl: 'Nederlands', | ||||
|     nn: 'Nynorsk', | ||||
|     no: 'Norsk', | ||||
|     oc: 'Occitan', | ||||
|     pl: 'Polski', | ||||
|     'pt-BR': 'Português (Brasil)', | ||||
|     'pt-PT': 'Português (Portugal)', | ||||
|     pt: 'Português', | ||||
|     ro: 'Română', | ||||
|     ru: 'Русский', | ||||
|     sa: 'संस्कृतम्', | ||||
|     sc: 'Sardu', | ||||
|     si: 'සිංහල', | ||||
|     sk: 'Slovenčina', | ||||
|     sl: 'Slovenščina', | ||||
|     sq: 'Shqip', | ||||
|     'sr-Latn': 'Srpski (latinica)', | ||||
|     sr: 'Српски', | ||||
|     sv: 'Svenska', | ||||
|     ta: 'தமிழ்', | ||||
|     te: 'తెలుగు', | ||||
|     th: 'ไทย', | ||||
|     tr: 'Türkçe', | ||||
|     uk: 'Українська', | ||||
|     ur: 'اُردُو', | ||||
|     vi: 'Tiếng Việt', | ||||
|     zgh: 'ⵜⴰⵎⴰⵣⵉⵖⵜ', | ||||
|     'zh-CN': '简体中文', | ||||
|     'zh-HK': '繁體中文(香港)', | ||||
|     'zh-TW': '繁體中文(臺灣)', | ||||
|     zh: '中文', | ||||
|   }.freeze | ||||
|  | ||||
|   def human_locale(locale) | ||||
|     if locale == 'und' | ||||
|       I18n.t('generic.none') | ||||
|     elsif (supported_locale = SUPPORTED_LOCALES[locale.to_sym]) | ||||
|       supported_locale[1] | ||||
|     elsif (regional_locale = REGIONAL_LOCALE_NAMES[locale.to_sym]) | ||||
|       regional_locale | ||||
|     else | ||||
|       HUMAN_LOCALES[locale.to_sym] || locale | ||||
|       locale | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   def valid_locale_or_nil(str) | ||||
|     return if str.blank? | ||||
|  | ||||
|     code, = str.to_s.split(/[_-]/) # Strip out the region from e.g. en_US or ja-JP | ||||
|  | ||||
|     return unless valid_locale?(code) | ||||
|  | ||||
|     code | ||||
|   end | ||||
|  | ||||
|   def valid_locale?(locale) | ||||
|     SUPPORTED_LOCALES.key?(locale.to_sym) | ||||
|   end | ||||
| end | ||||
|   | ||||
| @@ -2,7 +2,7 @@ | ||||
|  | ||||
| module SettingsHelper | ||||
|   def filterable_languages | ||||
|     LanguageDetector.instance.language_names.select(&LanguagesHelper::HUMAN_LOCALES.method(:key?)) | ||||
|     LanguagesHelper::SUPPORTED_LOCALES.keys | ||||
|   end | ||||
|  | ||||
|   def hash_to_object(hash) | ||||
|   | ||||
| @@ -112,7 +112,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity | ||||
|         url: @status_parser.url || @status_parser.uri, | ||||
|         account: @account, | ||||
|         text: converted_object_type? ? converted_text : (@status_parser.text || ''), | ||||
|         language: @status_parser.language || detected_language, | ||||
|         language: @status_parser.language, | ||||
|         spoiler_text: converted_object_type? ? '' : (@status_parser.spoiler_text || ''), | ||||
|         created_at: @status_parser.created_at, | ||||
|         edited_at: @status_parser.edited_at, | ||||
| @@ -370,10 +370,6 @@ class ActivityPub::Activity::Create < ActivityPub::Activity | ||||
|     Formatter.instance.linkify([@status_parser.title.presence, @status_parser.spoiler_text.presence, @status_parser.url || @status_parser.uri].compact.join("\n\n")) | ||||
|   end | ||||
|  | ||||
|   def detected_language | ||||
|     LanguageDetector.instance.detect(@status_parser.text, @account) if supported_object_type? | ||||
|   end | ||||
|  | ||||
|   def unsupported_media_type?(mime_type) | ||||
|     mime_type.present? && !MediaAttachment.supported_mime_types.include?(mime_type) | ||||
|   end | ||||
|   | ||||
| @@ -1,101 +0,0 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| class LanguageDetector | ||||
|   include Singleton | ||||
|  | ||||
|   WORDS_THRESHOLD        = 4 | ||||
|   RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\p{Thai}]+/m | ||||
|  | ||||
|   def initialize | ||||
|     @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048) | ||||
|   end | ||||
|  | ||||
|   def detect(text, account) | ||||
|     input_text = prepare_text(text) | ||||
|  | ||||
|     return if input_text.blank? | ||||
|  | ||||
|     detect_language_code(input_text) || default_locale(account) | ||||
|   end | ||||
|  | ||||
|   def language_names | ||||
|     @language_names = CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq | ||||
|   end | ||||
|  | ||||
|   private | ||||
|  | ||||
|   def prepare_text(text) | ||||
|     simplify_text(text).strip | ||||
|   end | ||||
|  | ||||
|   def unreliable_input?(text) | ||||
|     !reliable_input?(text) | ||||
|   end | ||||
|  | ||||
|   def reliable_input?(text) | ||||
|     sufficient_text_length?(text) || language_specific_character_set?(text) | ||||
|   end | ||||
|  | ||||
|   def sufficient_text_length?(text) | ||||
|     text.split(/\s+/).size >= WORDS_THRESHOLD | ||||
|   end | ||||
|  | ||||
|   def language_specific_character_set?(text) | ||||
|     words = text.scan(RELIABLE_CHARACTERS_RE) | ||||
|  | ||||
|     if words.present? | ||||
|       words.reduce(0) { |acc, elem| acc + elem.size }.to_f / text.size > 0.3 | ||||
|     else | ||||
|       false | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   def detect_language_code(text) | ||||
|     return if unreliable_input?(text) | ||||
|  | ||||
|     result = @identifier.find_language(text) | ||||
|  | ||||
|     iso6391(result.language.to_s).to_sym if result&.reliable? | ||||
|   end | ||||
|  | ||||
|   def iso6391(bcp47) | ||||
|     iso639 = bcp47.split('-').first | ||||
|  | ||||
|     # CLD3 returns grandfathered language code for Hebrew | ||||
|     return 'he' if iso639 == 'iw' | ||||
|  | ||||
|     ISO_639.find(iso639).alpha2 | ||||
|   end | ||||
|  | ||||
|   def simplify_text(text) | ||||
|     new_text = remove_html(text) | ||||
|     new_text.gsub!(FetchLinkCardService::URL_PATTERN, '\1') | ||||
|     new_text.gsub!(Account::MENTION_RE, '') | ||||
|     new_text.gsub!(Tag::HASHTAG_RE) { |string| string.gsub(/[#_]/, '#' => '', '_' => ' ').gsub(/[a-z][A-Z]|[a-zA-Z][\d]/) { |s| s.insert(1, ' ') }.downcase } | ||||
|     new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '') | ||||
|     new_text.gsub!(/\s+/, ' ') | ||||
|     new_text | ||||
|   end | ||||
|  | ||||
|   def new_scrubber | ||||
|     scrubber = Rails::Html::PermitScrubber.new | ||||
|     scrubber.tags = %w(br p) | ||||
|     scrubber | ||||
|   end | ||||
|  | ||||
|   def scrubber | ||||
|     @scrubber ||= new_scrubber | ||||
|   end | ||||
|  | ||||
|   def remove_html(text) | ||||
|     text = Loofah.fragment(text).scrub!(scrubber).to_s | ||||
|     text.gsub!('<br>', "\n") | ||||
|     text.gsub!('</p><p>', "\n\n") | ||||
|     text.gsub!(/(^<p>|<\/p>$)/, '') | ||||
|     text | ||||
|   end | ||||
|  | ||||
|   def default_locale(account) | ||||
|     account.user_locale&.to_sym || I18n.default_locale if account.local? | ||||
|   end | ||||
| end | ||||
| @@ -2,6 +2,7 @@ | ||||
|  | ||||
| class LinkDetailsExtractor | ||||
|   include ActionView::Helpers::TagHelper | ||||
|   include LanguagesHelper | ||||
|  | ||||
|   # Some publications wrap their JSON-LD data in their <script> tags | ||||
|   # in commented-out CDATA blocks, they need to be removed before | ||||
| @@ -218,14 +219,6 @@ class LinkDetailsExtractor | ||||
|     nil | ||||
|   end | ||||
|  | ||||
|   def valid_locale_or_nil(str) | ||||
|     return nil if str.blank? | ||||
|  | ||||
|     code,  = str.split(/_-/) # Strip out the region from e.g. en_US or ja-JA | ||||
|     locale = ISO_639.find(code) | ||||
|     locale&.alpha2 | ||||
|   end | ||||
|  | ||||
|   def link_tag(name) | ||||
|     document.xpath("//link[@rel=\"#{name}\"]").map { |link| link['href'] }.first | ||||
|   end | ||||
|   | ||||
| @@ -245,6 +245,10 @@ class User < ApplicationRecord | ||||
|     save! | ||||
|   end | ||||
|  | ||||
|   def preferred_posting_language | ||||
|     settings.default_language || locale | ||||
|   end | ||||
|  | ||||
|   def setting_default_privacy | ||||
|     settings.default_privacy || (account.locked? ? 'private' : 'public') | ||||
|   end | ||||
|   | ||||
| @@ -120,7 +120,7 @@ class ActivityPub::ProcessStatusUpdateService < BaseService | ||||
|     @status.text         = @status_parser.text || '' | ||||
|     @status.spoiler_text = @status_parser.spoiler_text || '' | ||||
|     @status.sensitive    = @account.sensitized? || @status_parser.sensitive || false | ||||
|     @status.language     = @status_parser.language || detected_language | ||||
|     @status.language     = @status_parser.language | ||||
|     @status.edited_at    = @status_parser.edited_at || Time.now.utc if significant_changes? | ||||
|  | ||||
|     @status.save! | ||||
| @@ -210,10 +210,6 @@ class ActivityPub::ProcessStatusUpdateService < BaseService | ||||
|     { redis: Redis.current, key: "create:#{@uri}", autorelease: 15.minutes.seconds } | ||||
|   end | ||||
|  | ||||
|   def detected_language | ||||
|     LanguageDetector.instance.detect(@status_parser.text, @account) | ||||
|   end | ||||
|  | ||||
|   def create_previous_edit! | ||||
|     # We only need to create a previous edit when no previous edits exist, e.g. | ||||
|     # when the status has never been edited. For other cases, we always create | ||||
|   | ||||
| @@ -2,6 +2,7 @@ | ||||
|  | ||||
| class PostStatusService < BaseService | ||||
|   include Redisable | ||||
|   include LanguagesHelper | ||||
|  | ||||
|   MIN_SCHEDULE_OFFSET = 5.minutes.freeze | ||||
|  | ||||
| @@ -109,10 +110,6 @@ class PostStatusService < BaseService | ||||
|     raise Mastodon::ValidationError, I18n.t('media_attachments.validations.not_ready') if @media.any?(&:not_processed?) | ||||
|   end | ||||
|  | ||||
|   def language_from_option(str) | ||||
|     ISO_639.find(str)&.alpha2 | ||||
|   end | ||||
|  | ||||
|   def process_mentions_service | ||||
|     ProcessMentionsService.new | ||||
|   end | ||||
| @@ -165,7 +162,7 @@ class PostStatusService < BaseService | ||||
|       sensitive: @sensitive, | ||||
|       spoiler_text: @options[:spoiler_text] || '', | ||||
|       visibility: @visibility, | ||||
|       language: language_from_option(@options[:language]) || @account.user&.setting_default_language&.presence || LanguageDetector.instance.detect(@text, @account), | ||||
|       language: valid_locale_or_nil(@options[:language].presence || @account.user&.preferred_posting_language || I18n.default_locale), | ||||
|       application: @options[:application], | ||||
|       rate_limit: @options[:with_rate_limit], | ||||
|     }.compact | ||||
|   | ||||
| @@ -1,5 +1,7 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| require 'csv' | ||||
|  | ||||
| class ImportValidator < ActiveModel::Validator | ||||
|   KNOWN_HEADERS = [ | ||||
|     'Account address', | ||||
|   | ||||
| @@ -23,7 +23,7 @@ | ||||
|       = f.input :setting_default_privacy, collection: Status.selectable_visibilities, wrapper: :with_label, include_blank: false, label_method: lambda { |visibility| safe_join([I18n.t("statuses.visibilities.#{visibility}"), I18n.t("statuses.visibilities.#{visibility}_long")], ' - ') }, required: false, hint: false | ||||
|  | ||||
|     .fields-group.fields-row__column.fields-row__column-6 | ||||
|       = f.input :setting_default_language, collection: [nil] + filterable_languages.sort, wrapper: :with_label, label_method: lambda { |locale| locale.nil? ? I18n.t('statuses.language_detection') : human_locale(locale) }, required: false, include_blank: false, hint: false | ||||
|       = f.input :setting_default_language, collection: [nil] + filterable_languages, wrapper: :with_label, label_method: lambda { |locale| locale.nil? ? I18n.t('statuses.default_language') : human_locale(locale) }, required: false, include_blank: false, hint: false | ||||
|  | ||||
|   .fields-group | ||||
|     = f.input :setting_default_sensitive, as: :boolean, wrapper: :with_label | ||||
| @@ -34,7 +34,7 @@ | ||||
|   %h4= t 'preferences.public_timelines' | ||||
|  | ||||
|   .fields-group | ||||
|     = f.input :chosen_languages, collection: filterable_languages.sort, wrapper: :with_block_label, include_blank: false, label_method: lambda { |locale| human_locale(locale) }, required: false, as: :check_boxes, collection_wrapper_tag: 'ul', item_wrapper_tag: 'li' | ||||
|     = f.input :chosen_languages, collection: filterable_languages, wrapper: :with_block_label, include_blank: false, label_method: lambda { |locale| human_locale(locale) }, required: false, as: :check_boxes, collection_wrapper_tag: 'ul', item_wrapper_tag: 'li' | ||||
|  | ||||
|   .actions | ||||
|     = f.button :button, t('generic.save_changes'), type: :submit | ||||
|   | ||||
| @@ -1307,13 +1307,13 @@ en: | ||||
|         other: "%{count} videos" | ||||
|     boosted_from_html: Boosted from %{acct_link} | ||||
|     content_warning: 'Content warning: %{warning}' | ||||
|     default_language: Same as interface language | ||||
|     disallowed_hashtags: | ||||
|       one: 'contained a disallowed hashtag: %{tags}' | ||||
|       other: 'contained the disallowed hashtags: %{tags}' | ||||
|     edited_at: Edited %{date} | ||||
|     errors: | ||||
|       in_reply_not_found: The post you are trying to reply to does not appear to exist. | ||||
|     language_detection: Automatically detect language | ||||
|     open_in_web: Open in web | ||||
|     over_character_limit: character limit of %{max} exceeded | ||||
|     pin_errors: | ||||
|   | ||||
| @@ -96,7 +96,8 @@ namespace :repo do | ||||
|     end.uniq.compact | ||||
|  | ||||
|     missing_available_locales = locales_in_files - I18n.available_locales | ||||
|     missing_locale_names = I18n.available_locales.reject { |locale| LanguagesHelper::HUMAN_LOCALES.key?(locale) } | ||||
|     supported_locale_codes    = Set.new(LanguagesHelper::SUPPORTED_LOCALES.keys + LanguagesHelper::REGIONAL_LOCALE_NAMES.keys) | ||||
|     missing_locale_names      = I18n.available_locales.reject { |locale| supported_locale_codes.include?(locale) } | ||||
|  | ||||
|     critical = false | ||||
|  | ||||
| @@ -123,7 +124,7 @@ namespace :repo do | ||||
|  | ||||
|     unless missing_locale_names.empty? | ||||
|       puts pastel.yellow("You are missing human-readable names for these locales: #{pastel.bold(missing_locale_names.join(', '))}") | ||||
|       puts pastel.yellow("Add them to #{pastel.bold('HUMAN_LOCALES')} in app/helpers/settings_helper.rb or remove the locales from #{pastel.bold('I18n.available_locales')} in config/application.rb") | ||||
|       puts pastel.yellow("Add them to app/helpers/languages_helper.rb or remove the locales from #{pastel.bold('I18n.available_locales')} in config/application.rb") | ||||
|     end | ||||
|  | ||||
|     if critical | ||||
|   | ||||
| @@ -3,9 +3,9 @@ | ||||
| require 'rails_helper' | ||||
|  | ||||
| describe LanguagesHelper do | ||||
|   describe 'the HUMAN_LOCALES constant' do | ||||
|     it 'includes all I18n locales' do | ||||
|       expect(described_class::HUMAN_LOCALES.keys).to include(*I18n.available_locales) | ||||
|   describe 'the SUPPORTED_LOCALES constant' do | ||||
|     it 'includes all i18n locales' do | ||||
|       expect(Set.new(described_class::SUPPORTED_LOCALES.keys + described_class::REGIONAL_LOCALE_NAMES.keys)).to include(*I18n.available_locales) | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   | ||||
| @@ -1,134 +0,0 @@ | ||||
| # frozen_string_literal: true | ||||
|  | ||||
| require 'rails_helper' | ||||
|  | ||||
| describe LanguageDetector do | ||||
|   describe 'prepare_text' do | ||||
|     it 'returns unmodified string without special cases' do | ||||
|       string = 'just a regular string' | ||||
|       result = described_class.instance.send(:prepare_text, string) | ||||
|  | ||||
|       expect(result).to eq string | ||||
|     end | ||||
|  | ||||
|     it 'collapses spacing in strings' do | ||||
|       string = 'The formatting   in    this is very        odd' | ||||
|  | ||||
|       result = described_class.instance.send(:prepare_text, string) | ||||
|       expect(result).to eq 'The formatting in this is very odd' | ||||
|     end | ||||
|  | ||||
|     it 'strips usernames from strings before detection' do | ||||
|       string = '@username Yeah, very surreal...! also @friend' | ||||
|  | ||||
|       result = described_class.instance.send(:prepare_text, string) | ||||
|       expect(result).to eq 'Yeah, very surreal...! also' | ||||
|     end | ||||
|  | ||||
|     it 'strips URLs from strings before detection' do | ||||
|       string = 'Our website is https://example.com and also http://localhost.dev' | ||||
|  | ||||
|       result = described_class.instance.send(:prepare_text, string) | ||||
|       expect(result).to eq 'Our website is and also' | ||||
|     end | ||||
|  | ||||
|     it 'converts #hashtags back to normal text before detection' do | ||||
|       string = 'Hey look at all the #animals and #FishAndChips' | ||||
|  | ||||
|       result = described_class.instance.send(:prepare_text, string) | ||||
|       expect(result).to eq 'Hey look at all the animals and fish and chips' | ||||
|     end | ||||
|   end | ||||
|  | ||||
|   describe 'detect' do | ||||
|     let(:account_without_user_locale) { Fabricate(:user, locale: nil).account } | ||||
|     let(:account_remote) { Fabricate(:account, domain: 'joinmastodon.org') } | ||||
|  | ||||
|     it 'detects english language for basic strings' do | ||||
|       strings = [ | ||||
|         "Hello and welcome to mastodon how are you today?", | ||||
|         "I'd rather not!", | ||||
|         "a lot of people just want to feel righteous all the time and that's all that matters", | ||||
|       ] | ||||
|       strings.each do |string| | ||||
|         result = described_class.instance.detect(string, account_without_user_locale) | ||||
|  | ||||
|         expect(result).to eq(:en), string | ||||
|       end | ||||
|     end | ||||
|  | ||||
|     it 'detects spanish language' do | ||||
|       string = 'Obtener un Hola y bienvenidos a Mastodon. Obtener un Hola y bienvenidos a Mastodon. Obtener un Hola y bienvenidos a Mastodon. Obtener un Hola y bienvenidos a Mastodon' | ||||
|       result = described_class.instance.detect(string, account_without_user_locale) | ||||
|  | ||||
|       expect(result).to eq :es | ||||
|     end | ||||
|  | ||||
|     describe 'when language can\'t be detected' do | ||||
|       it 'uses nil when sent an empty document' do | ||||
|         result = described_class.instance.detect('', account_without_user_locale) | ||||
|         expect(result).to eq nil | ||||
|       end | ||||
|  | ||||
|       describe 'because of a URL' do | ||||
|         it 'uses nil when sent just a URL' do | ||||
|           string = 'http://example.com/media/2kFTgOJLXhQf0g2nKB4' | ||||
|           cld_result = CLD3::NNetLanguageIdentifier.new(0, 2048).find_language(string) | ||||
|           expect(cld_result).not_to eq :en | ||||
|  | ||||
|           result = described_class.instance.detect(string, account_without_user_locale) | ||||
|  | ||||
|           expect(result).to eq nil | ||||
|         end | ||||
|       end | ||||
|  | ||||
|       describe 'with an account' do | ||||
|         it 'uses the account locale when present' do | ||||
|           account = double(user_locale: 'fr') | ||||
|           result  = described_class.instance.detect('', account) | ||||
|  | ||||
|           expect(result).to eq nil | ||||
|         end | ||||
|  | ||||
|         it 'uses nil when account is present but has no locale' do | ||||
|           result = described_class.instance.detect('', account_without_user_locale) | ||||
|  | ||||
|           expect(result).to eq nil | ||||
|         end | ||||
|       end | ||||
|  | ||||
|       describe 'with an `en` default locale' do | ||||
|         it 'uses nil for undetectable string' do | ||||
|           result = described_class.instance.detect('', account_without_user_locale) | ||||
|  | ||||
|           expect(result).to eq nil | ||||
|         end | ||||
|       end | ||||
|  | ||||
|       describe 'remote user' do | ||||
|         it 'detects Korean language' do | ||||
|           string = '안녕하세요' | ||||
|           result = described_class.instance.detect(string, account_remote) | ||||
|  | ||||
|           expect(result).to eq :ko | ||||
|         end | ||||
|       end | ||||
|  | ||||
|       describe 'with a non-`en` default locale' do | ||||
|         around(:each) do |example| | ||||
|           before = I18n.default_locale | ||||
|           I18n.default_locale = :ja | ||||
|           example.run | ||||
|           I18n.default_locale = before | ||||
|         end | ||||
|  | ||||
|         it 'uses nil for undetectable string' do | ||||
|           string = '' | ||||
|           result = described_class.instance.detect(string, account_without_user_locale) | ||||
|  | ||||
|           expect(result).to eq nil | ||||
|         end | ||||
|       end | ||||
|     end | ||||
|   end | ||||
| end | ||||
		Reference in New Issue
	
	Block a user