FEATURE: link correctly to filters to assist in debugging spam (#1031)

- Add spam_score_type to AiSpamSerializer for better integration with reviewables.
- Introduce a custom filter for detecting AI spam false negatives in moderation workflows.
- Refactor spam report generation to improve identification of false negatives.
- Add tests to verify the custom filter and its behavior.
- Introduce links for all spam counts in the report.
discourse · Dec 17, 2024 · fae2d5f · fae2d5f
1 parent 90ce942
commit fae2d5f
Show file tree

Hide file tree

Showing 5 changed files with 118 additions and 28 deletions.
diff --git a/app/serializers/ai_spam_serializer.rb b/app/serializers/ai_spam_serializer.rb
@@ -1,7 +1,13 @@
 # frozen_string_literal: true
 
 class AiSpamSerializer < ApplicationSerializer
-  attributes :is_enabled, :llm_id, :custom_instructions, :available_llms, :stats, :flagging_username
+  attributes :is_enabled,
+             :llm_id,
+             :custom_instructions,
+             :available_llms,
+             :stats,
+             :flagging_username,
+             :spam_score_type
 
   def is_enabled
     object[:enabled]
@@ -25,6 +31,10 @@ def flagging_username
     object[:flagging_username]
   end
 
+  def spam_score_type # serialized attribute: the :spam entry of ReviewableScore.types, used client-side as the score_type query value
+    ReviewableScore.types[:spam]
+  end
+
   def stats
     {
       scanned_count: object[:stats].scanned_count.to_i,

diff --git a/assets/javascripts/discourse/components/ai-spam.gjs b/assets/javascripts/discourse/components/ai-spam.gjs
@@ -125,9 +125,30 @@ export default class AiSpam extends Component {
       label: i18n("discourse_ai.spam.spam_detected"),
       value: this.stats.spam_detected,
     };
+
+    const falsePositives = {
+      label: i18n("discourse_ai.spam.false_positives"),
+      value: this.stats.false_positives,
+      tooltip: i18n("discourse_ai.spam.stat_tooltips.incorrectly_flagged"),
+    };
+
+    const falseNegatives = {
+      label: i18n("discourse_ai.spam.false_negatives"),
+      value: this.stats.false_negatives,
+      tooltip: i18n("discourse_ai.spam.stat_tooltips.missed_spam"),
+    };
+
     if (this.args.model.flagging_username) {
       detected.href = getURL(
-        "/review?flagged_by=" + this.args.model.flagging_username
+        `/review?flagged_by=${this.args.model.flagging_username}&status=all&sort_order=created_at`
+      );
+
+      falsePositives.href = getURL(
+        `/review?flagged_by=${this.args.model.flagging_username}&status=rejected&sort_order=created_at`
+      );
+
+      falseNegatives.href = getURL(
+        `/review?status=approved&sort_order=created_at&additional_filters={"ai_spam_false_negative":true}&order=created&score_type=${this.args.model.spam_score_type}`
       );
     }
     return [
@@ -136,16 +157,8 @@ export default class AiSpam extends Component {
         value: this.stats.scanned_count,
       },
       detected,
-      {
-        label: i18n("discourse_ai.spam.false_positives"),
-        value: this.stats.false_positives,
-        tooltip: i18n("discourse_ai.spam.stat_tooltips.incorrectly_flagged"),
-      },
-      {
-        label: i18n("discourse_ai.spam.false_negatives"),
-        value: this.stats.false_negatives,
-        tooltip: i18n("discourse_ai.spam.stat_tooltips.missed_spam"),
-      },
+      falsePositives,
+      falseNegatives,
     ];
   }
 

diff --git a/lib/ai_moderation/entry_point.rb b/lib/ai_moderation/entry_point.rb
@@ -11,6 +11,25 @@ def inject_into(plugin)
         plugin.on(:site_setting_changed) do |name, _old_value, new_value|
           SpamScanner.ensure_flagging_user! if name == :ai_spam_detection_enabled && new_value
         end
+
+        custom_filter = [ # [filter name, callable] pair handed to Reviewable.add_custom_filter below
+          :ai_spam_false_negative, # same key the review links and the request spec send inside additional_filters
+          Proc.new do |results, value| # results must respond to `where`; value is the requested filter value
+            if value
+              results.where(<<~SQL) # keep rows that have an ai_spam_logs entry with is_spam false for the same post; NOTE(review): target_id/target_type are unqualified, presumably the outer reviewables columns - confirm
+              EXISTS (
+                SELECT 1 FROM ai_spam_logs
+                WHERE NOT is_spam
+                AND post_id = target_id AND target_type = 'Post'
+              )
+            SQL
+            else
+              results # falsy value: return the query unchanged
+            end
+          end,
+        ]
+
+        Reviewable.add_custom_filter(custom_filter) # registers the pair with Reviewable
       end
     end
   end

diff --git a/lib/ai_moderation/spam_report.rb b/lib/ai_moderation/spam_report.rb
@@ -14,33 +14,34 @@ def self.generate(min_date: 1.week.ago)
               asl.post_id,
               asl.is_spam,
               r.status as reviewable_status,
-              r.target_type,
-              r.potential_spam
+              CASE WHEN EXISTS (
+                SELECT 1 FROM reviewable_scores rs
+                JOIN reviewables r1 ON r1.id = rs.reviewable_id
+                WHERE r1.target_id = asl.post_id
+                AND r1.target_type = 'Post'
+                AND rs.reviewable_score_type = :spam_score_type
+                AND NOT is_spam
+                AND r1.status IN (:spam)
+              ) THEN true ELSE false END AS missed_spam
             FROM ai_spam_logs asl
             LEFT JOIN reviewables r ON r.id = asl.reviewable_id
             WHERE asl.created_at > :min_date
-          ),
-          post_reviewables AS (
-            SELECT
-              target_id post_id,
-              COUNT(DISTINCT target_id) as false_negative_count
-            FROM reviewables
-            WHERE target_type = 'Post'
-              AND status IN (:spam)
-              AND potential_spam
-              AND target_id IN (SELECT post_id FROM spam_stats)
-            GROUP BY target_id
           )
           SELECT
             COUNT(*) AS scanned_count,
             SUM(CASE WHEN is_spam THEN 1 ELSE 0 END) AS spam_detected,
             COUNT(CASE WHEN reviewable_status IN (:ham) THEN 1 END) AS false_positives,
-            COALESCE(SUM(pr.false_negative_count), 0) AS false_negatives
+            COUNT(CASE WHEN missed_spam THEN 1 END) AS false_negatives
           FROM spam_stats
-          LEFT JOIN post_reviewables pr USING (post_id)
         SQL
 
-        DB.query(sql, spam: spam_status, ham: ham_status, min_date: min_date).first
+        DB.query(
+          sql,
+          spam: spam_status,
+          ham: ham_status,
+          min_date: min_date,
+          spam_score_type: ReviewableScore.types[:spam],
+        ).first
       end
     end
   end

diff --git a/spec/requests/admin/reviewable_controller_spec.rb b/spec/requests/admin/reviewable_controller_spec.rb
@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+RSpec.describe ReviewablesController do # request spec for the ai_spam_false_negative review filter
+  fab!(:post1) { Fabricate(:post) }
+  fab!(:post2) { Fabricate(:post) }
+  fab!(:admin)
+  fab!(:llm_model)
+
+  fab!(:reviewable) do # pending reviewable targeting post1, the post that gets a "not spam" AI log below
+    Reviewable.create!(
+      target: post1,
+      topic: post2.topic, # NOTE(review): target is post1 but the topic comes from post2 - confirm this is intended
+      type: ReviewablePost,
+      created_by: admin,
+      status: Reviewable.statuses[:pending],
+    )
+  end
+
+  fab!(:reviewable2) do # pending reviewable targeting post2, which has no AiSpamLog row in this spec
+    Reviewable.create!(
+      target: post2,
+      topic: post2.topic,
+      type: ReviewablePost,
+      created_by: admin,
+      status: Reviewable.statuses[:pending],
+    )
+  end
+
+  fab!(:ai_spam_log_missed) do # log row for post1 with is_spam false
+    AiSpamLog.create!(is_spam: false, post_id: post1.id, llm_model_id: llm_model.id)
+  end
+  # The plugin registers a custom filter on the review list, so confirm the endpoint applies it.
+  it "properly applies custom filter" do
+    sign_in(admin)
+
+    get '/review.json?additional_filters={"ai_spam_false_negative":true}'
+    expect(response.status).to eq(200)
+
+    json = JSON.parse(response.body)
+    expect(json["reviewables"].length).to eq(1) # only the reviewable for post1 has a matching log
+
+    get "/review.json"
+    expect(response.status).to eq(200)
+    json = JSON.parse(response.body)
+    expect(json["reviewables"].length).to eq(2) # without the filter both reviewables are returned
+  end
+end