mirror of
https://github.com/glitch-soc/mastodon.git
synced 2025-01-11 10:12:56 -05:00
Add more accurate hashtag search (#11579)
* Add more accurate hashtag search

  Using Elasticsearch to index hashtags with edge n-grams and score them by usage within the last 7 days since last activity. Only hashtags that have been reviewed and are listable can appear in searches, unless they match the query exactly.

* Fix search analyzer dropping non-ASCII characters
This commit is contained in:
parent
3a77090d01
commit
cc0a55cf9a
37
app/chewy/tags_index.rb
Normal file
37
app/chewy/tags_index.rb
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# Chewy index definition for hashtags.
#
# Tag names are indexed twice: once as a single normalized token (`name`) and
# once as edge n-grams of that name (`name.edge_ngram`) for prefix matching.
class TagsIndex < Chewy::Index
  settings(
    index: { refresh_interval: '15m' },
    analysis: {
      analyzer: {
        # Whole name as one token, normalized by the listed token filters.
        content: {
          tokenizer: 'keyword',
          filter: %w(lowercase asciifolding cjk_width),
        },

        # Same normalization, but tokenized with the custom tokenizer below.
        edge_ngram: {
          tokenizer: 'edge_ngram',
          filter: %w(lowercase asciifolding cjk_width),
        },
      },

      tokenizer: {
        # Leading substrings of 2 to 15 characters.
        edge_ngram: {
          type: 'edge_ngram',
          min_gram: 2,
          max_gram: 15,
        },
      },
    }
  )

  # Passed as `delete_if`: true for tags that are destroyed or not listable.
  no_longer_indexable = ->(tag) { tag.destroyed? || !tag.listable? }

  define_type(::Tag.listable, delete_if: no_longer_indexable) do
    root(date_detection: false) do
      field(:name, type: 'text', analyzer: 'content') do
        # Indexed as n-grams, but searched with the whole-token analyzer.
        field(:edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content')
      end

      field(:reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? })

      # Sum of the :accounts values over every entry of tag.history.
      field(:usage, type: 'long', value: ->(tag) { tag.history.sum { |entry| entry[:accounts].to_i } })

      # Falls back to the creation time when no last_status_at is recorded.
      field(:last_status_at, type: 'date', value: ->(tag) { tag.last_status_at || tag.created_at })
    end
  end
end
|
@ -13,6 +13,8 @@
|
|||||||
# listable :boolean
|
# listable :boolean
|
||||||
# reviewed_at :datetime
|
# reviewed_at :datetime
|
||||||
# requested_review_at :datetime
|
# requested_review_at :datetime
|
||||||
|
# last_status_at :datetime
|
||||||
|
# last_trend_at :datetime
|
||||||
#
|
#
|
||||||
|
|
||||||
class Tag < ApplicationRecord
|
class Tag < ApplicationRecord
|
||||||
@ -33,7 +35,8 @@ class Tag < ApplicationRecord
|
|||||||
scope :unreviewed, -> { where(reviewed_at: nil) }
|
scope :unreviewed, -> { where(reviewed_at: nil) }
|
||||||
scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) }
|
scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) }
|
||||||
scope :usable, -> { where(usable: [true, nil]) }
|
scope :usable, -> { where(usable: [true, nil]) }
|
||||||
scope :discoverable, -> { where(listable: [true, nil]).joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
|
scope :listable, -> { where(listable: [true, nil]) }
|
||||||
|
scope :discoverable, -> { listable.joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
|
||||||
scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) }
|
scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) }
|
||||||
|
|
||||||
delegate :accounts_count,
|
delegate :accounts_count,
|
||||||
@ -44,6 +47,8 @@ class Tag < ApplicationRecord
|
|||||||
|
|
||||||
after_save :save_account_tag_stat
|
after_save :save_account_tag_stat
|
||||||
|
|
||||||
|
update_index('tags#tag', :self) if Chewy.enabled?
|
||||||
|
|
||||||
def account_tag_stat
|
def account_tag_stat
|
||||||
super || build_account_tag_stat
|
super || build_account_tag_stat
|
||||||
end
|
end
|
||||||
@ -121,9 +126,10 @@ class Tag < ApplicationRecord
|
|||||||
normalized_term = normalize(term.strip).mb_chars.downcase.to_s
|
normalized_term = normalize(term.strip).mb_chars.downcase.to_s
|
||||||
pattern = sanitize_sql_like(normalized_term) + '%'
|
pattern = sanitize_sql_like(normalized_term) + '%'
|
||||||
|
|
||||||
Tag.where(arel_table[:name].lower.matches(pattern))
|
Tag.listable
|
||||||
.where(arel_table[:score].gt(0).or(arel_table[:name].lower.eq(normalized_term)))
|
.where(arel_table[:name].lower.matches(pattern))
|
||||||
.order(Arel.sql('length(name) ASC, score DESC, name ASC'))
|
.where(arel_table[:name].lower.eq(normalized_term).or(arel_table[:reviewed_at].not_eq(nil)))
|
||||||
|
.order(Arel.sql('length(name) ASC, name ASC'))
|
||||||
.limit(limit)
|
.limit(limit)
|
||||||
.offset(offset)
|
.offset(offset)
|
||||||
end
|
end
|
||||||
|
@ -17,6 +17,9 @@ class TrendingTags
|
|||||||
increment_historical_use!(tag.id, at_time)
|
increment_historical_use!(tag.id, at_time)
|
||||||
increment_unique_use!(tag.id, account.id, at_time)
|
increment_unique_use!(tag.id, account.id, at_time)
|
||||||
increment_vote!(tag, at_time)
|
increment_vote!(tag, at_time)
|
||||||
|
|
||||||
|
tag.update(last_status_at: Time.now.utc) if tag.last_status_at.nil? || tag.last_status_at < 12.hours.ago
|
||||||
|
tag.update(last_trend_at: Time.now.utc) if trending?(tag) && (tag.last_trend_at.nil? || tag.last_trend_at < 12.hours.ago)
|
||||||
end
|
end
|
||||||
|
|
||||||
def get(limit, filtered: true)
|
def get(limit, filtered: true)
|
||||||
|
@ -109,7 +109,7 @@ class AccountSearchService < BaseService
|
|||||||
field_value_factor: {
|
field_value_factor: {
|
||||||
field: 'followers_count',
|
field: 'followers_count',
|
||||||
modifier: 'log2p',
|
modifier: 'log2p',
|
||||||
missing: 1,
|
missing: 0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
|
@ -57,10 +57,10 @@ class SearchService < BaseService
|
|||||||
end
|
end
|
||||||
|
|
||||||
def perform_hashtags_search!
|
def perform_hashtags_search!
|
||||||
Tag.search_for(
|
TagSearchService.new.call(
|
||||||
@query.gsub(/\A#/, ''),
|
@query,
|
||||||
@limit,
|
limit: @limit,
|
||||||
@offset
|
offset: @offset
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
82
app/services/tag_search_service.rb
Normal file
82
app/services/tag_search_service.rb
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# Looks up hashtags matching a user-supplied query, using the Elasticsearch
# index when Chewy is enabled and falling back to the database otherwise.
class TagSearchService < BaseService
  # @param query [String] search text; surrounding whitespace and a leading
  #   '#' are removed before searching
  # @param options [Hash] reads :limit and :offset, each coerced with #to_i
  #   (so a missing key becomes 0)
  # @return the matching tags, from TagsIndex or from Tag.search_for
  def call(query, options = {})
    @query  = query.strip.sub(/\A#/, '')
    @offset = options[:offset].to_i
    @limit  = options[:limit].to_i

    if Chewy.enabled?
      from_elasticsearch
    else
      from_database
    end
  end

  private

  # Runs the scored query restricted by the visibility filter.
  # NOTE(review): the trailing #compact presumably drops index hits whose
  # record could not be loaded — confirm against Chewy's `objects`.
  def from_elasticsearch
    TagsIndex.query(scored_query)
             .filter(visibility_filter)
             .limit(@limit)
             .offset(@offset)
             .objects
             .compact
  end

  # Text match on the name (whole token and edge n-grams), with the relevance
  # score multiplied by the scoring functions below.
  def scored_query
    {
      function_score: {
        query: {
          multi_match: {
            query: @query,
            fields: %w(name.edge_ngram name),
            type: 'most_fields',
            operator: 'and',
          },
        },

        functions: [
          # Favour tags with a higher indexed `usage` value.
          {
            field_value_factor: {
              field: 'usage',
              modifier: 'log2p',
              missing: 0,
            },
          },

          # Decay by `last_status_at`: nothing is lost within the 14d offset,
          # and the factor reaches 0.5 a further 7d (the scale) beyond it.
          {
            gauss: {
              last_status_at: {
                scale: '7d',
                offset: '14d',
                decay: 0.5,
              },
            },
          },
        ],

        boost_mode: 'multiply',
      },
    }
  end

  # Keeps a hit when the tag is reviewed OR its name equals the query.
  def visibility_filter
    {
      bool: {
        should: [
          {
            term: {
              reviewed: {
                value: true,
              },
            },
          },

          # `name` is an analyzed text field, so the query has to go through
          # the same analyzer to equal the stored token. A `term` query would
          # compare the raw input and miss e.g. mixed-case queries.
          {
            match: {
              name: {
                query: @query,
              },
            },
          },
        ],
      },
    }
  end

  # Database fallback used when Chewy is disabled.
  def from_database
    Tag.search_for(@query, @limit, @offset)
  end
end
|
@ -142,7 +142,7 @@ en:
|
|||||||
report: Send e-mail when a new report is submitted
|
report: Send e-mail when a new report is submitted
|
||||||
trending_tag: Send e-mail when an unreviewed hashtag is trending
|
trending_tag: Send e-mail when an unreviewed hashtag is trending
|
||||||
tag:
|
tag:
|
||||||
listable: Allow this hashtag to appear on the profile directory
|
listable: Allow this hashtag to appear in searches and on the profile directory
|
||||||
trendable: Allow this hashtag to appear under trends
|
trendable: Allow this hashtag to appear under trends
|
||||||
usable: Allow toots to use this hashtag
|
usable: Allow toots to use this hashtag
|
||||||
'no': 'No'
|
'no': 'No'
|
||||||
|
6
db/migrate/20190815225426_add_last_status_at_to_tags.rb
Normal file
6
db/migrate/20190815225426_add_last_status_at_to_tags.rb
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# Adds the last_status_at and last_trend_at datetime columns to the tags table.
class AddLastStatusAtToTags < ActiveRecord::Migration[5.2]
  def change
    # Same two add_column calls, in the same order, expressed as a loop.
    %i(last_status_at last_trend_at).each do |column_name|
      add_column :tags, column_name, :datetime
    end
  end
end
|
@ -10,7 +10,7 @@
|
|||||||
#
|
#
|
||||||
# It's strongly recommended that you check this file into your version control system.
|
# It's strongly recommended that you check this file into your version control system.
|
||||||
|
|
||||||
ActiveRecord::Schema.define(version: 2019_08_07_135426) do
|
ActiveRecord::Schema.define(version: 2019_08_15_225426) do
|
||||||
|
|
||||||
# These are extensions that must be enabled in order to support this database
|
# These are extensions that must be enabled in order to support this database
|
||||||
enable_extension "plpgsql"
|
enable_extension "plpgsql"
|
||||||
@ -667,6 +667,8 @@ ActiveRecord::Schema.define(version: 2019_08_07_135426) do
|
|||||||
t.boolean "listable"
|
t.boolean "listable"
|
||||||
t.datetime "reviewed_at"
|
t.datetime "reviewed_at"
|
||||||
t.datetime "requested_review_at"
|
t.datetime "requested_review_at"
|
||||||
|
t.datetime "last_status_at"
|
||||||
|
t.datetime "last_trend_at"
|
||||||
t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true
|
t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -136,8 +136,8 @@ RSpec.describe Tag, type: :model do
|
|||||||
end
|
end
|
||||||
|
|
||||||
it 'finds the exact matching tag as the first item' do
|
it 'finds the exact matching tag as the first item' do
|
||||||
similar_tag = Fabricate(:tag, name: "matchlater", score: 1)
|
similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc)
|
||||||
tag = Fabricate(:tag, name: "match", score: 1)
|
tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc)
|
||||||
|
|
||||||
results = Tag.search_for("match")
|
results = Tag.search_for("match")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user