// @btrway/global-search/src/utils/similarity.ts
import { GlobalSearchResponse } from '@btrway/api-core';
import { DifferenceMetrics, TopResultsOptions } from '../types/results';

/**
 * Default cut-off thresholds for deciding when the similarity score of one
 * search result has fallen "significantly" below the result ranked before it.
 *
 * NOTE(review): in the code visible alongside this constant only `CLUSTER_GAP`
 * is actually read (by `filterTopResults`); confirm whether the remaining
 * thresholds are consumed elsewhere.
 */
export const DEFAULT_SIMILARITY_THRESHOLDS = {
  /**
   * How many standard deviations above the typical drop a given drop has to
   * be before it counts as significant. Raising this demands a more extreme
   * statistical outlier.
   */
  STANDARD_DEVIATION: 2.5,

  /**
   * Relative drop, in percent of the previous score, needed between two
   * consecutive results. Example: 45 means the score must fall by 45% of the
   * previous value to trigger a cutoff.
   */
  PERCENTAGE_DROP: 45,

  /**
   * Absolute drop, in percentage points, needed between two consecutive
   * results. Example: 25 corresponds to going from 75% to 50%.
   */
  ABSOLUTE_DROP: 25,

  /**
   * Minimum gap, in percentage points, that separates one group ("cluster")
   * of similar scores from the next, as opposed to an isolated small drop.
   */
  CLUSTER_GAP: 20,
} as const;

/**
 * Builds one metrics entry for every pair of adjacent results, describing how
 * far the similarity score moved from the earlier result to the later one.
 *
 * The input is used in the order given; this function does not sort it.
 *
 * @param results Search results whose `similarity` values are compared
 *   pairwise. Fewer than two results yields an empty array.
 * @returns One entry per adjacent pair, where `index` is the position of the
 *   later result of the pair, `diff` is `previous - current` (positive when
 *   the score falls), `percentDrop` is `diff` relative to the previous score
 *   times 100, and `absoluteDiff` is `diff` times 100 (percentage points if
 *   similarity is a 0–1 fraction — TODO confirm the score scale).
 */
export function calculateSimilarityDifferences(
  results: GlobalSearchResponse[]
): DifferenceMetrics[] {
  const differences: DifferenceMetrics[] = [];

  for (let i = 1; i < results.length; i++) {
    const previous = results[i - 1].similarity;
    const current = results[i].similarity;
    const diff = previous - current;

    differences.push({
      index: i,
      diff,
      // An unchanged score is a 0% drop whatever the previous value was. This
      // also stops two consecutive zero scores (0 / 0) from producing NaN.
      // A non-zero diff after a previous score of exactly 0 still evaluates
      // to +/-Infinity, as before.
      percentDrop: diff === 0 ? 0 : (diff / previous) * 100,
      absoluteDiff: diff * 100,
    });
  }

  return differences;
}

/**
 * Computes the mean and the population standard deviation (variance divided
 * by N, not N - 1) of the `diff` values in a list of difference metrics.
 *
 * @param differences Metrics whose `diff` fields are aggregated.
 * @returns `mean` and `stdDev` of the `diff` values. An empty list returns
 *   zeros for both instead of the NaN that dividing by a zero count would
 *   produce.
 */
export function calculateStatistics(differences: DifferenceMetrics[]): {
  mean: number;
  stdDev: number;
} {
  const count = differences.length;

  // Nothing to aggregate: avoid 0 / 0.
  if (count === 0) {
    return { mean: 0, stdDev: 0 };
  }

  const mean = differences.reduce((sum, d) => sum + d.diff, 0) / count;
  const variance =
    differences.reduce((sum, d) => sum + Math.pow(d.diff - mean, 2), 0) /
    count;

  return { mean, stdDev: Math.sqrt(variance) };
}

/**
 * Returns the leading group of results that sits above the first large gap in
 * similarity, or every result (sorted) when no qualifying gap exists.
 *
 * The results are copied and sorted by `similarity`, highest first; the input
 * array itself is not reordered. Walking down the sorted list, the first
 * position `i` that satisfies all of the following becomes the cut point and
 * only the results before it are returned:
 *   1. the drop from result `i - 1` to result `i`, times 100, is at least
 *      `DEFAULT_SIMILARITY_THRESHOLDS.CLUSTER_GAP`;
 *   2. the average similarity of the results before `i` is above 0.6, so a
 *      list whose scores are all low is never truncated;
 *   3. the average of the results before `i` exceeds the average of the
 *      results from `i` onward by at least `CLUSTER_GAP` (times 100).
 *
 * @param results Search results to filter. With zero or one element the same
 *   array instance is returned untouched.
 * @param options Accepted for API compatibility. This implementation does not
 *   read it; thresholds always come from `DEFAULT_SIMILARITY_THRESHOLDS`.
 * @returns A new array, sorted by descending similarity, holding either the
 *   leading cluster or all results.
 */
export function filterTopResults(
  results: GlobalSearchResponse[],
  options: TopResultsOptions = {}
): GlobalSearchResponse[] {
  if (results.length <= 1) {
    return results;
  }

  // The leading cluster's average must exceed this before a cut is allowed.
  const highScoreFloor = 0.6;
  const clusterGap = DEFAULT_SIMILARITY_THRESHOLDS.CLUSTER_GAP;

  const averageSimilarity = (items: GlobalSearchResponse[]): number =>
    items.reduce((sum, r) => sum + r.similarity, 0) / items.length;

  // Sort a copy so the caller's array keeps its order.
  const sortedResults = [...results].sort(
    (a, b) => b.similarity - a.similarity
  );

  for (let i = 1; i < sortedResults.length; i++) {
    const dropSize =
      (sortedResults[i - 1].similarity - sortedResults[i].similarity) * 100;

    // Only a drop of at least the cluster gap is a candidate cut point.
    if (dropSize >= clusterGap) {
      const firstClusterAvg = averageSimilarity(sortedResults.slice(0, i));
      const nextClusterAvg = averageSimilarity(sortedResults.slice(i));

      if (
        firstClusterAvg > highScoreFloor &&
        (firstClusterAvg - nextClusterAvg) * 100 >= clusterGap
      ) {
        return sortedResults.slice(0, i);
      }
    }
  }

  return sortedResults;
}
