package com.distributed.search.logic;

import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Static helpers for TF-IDF scoring: reads a document into lowercase word
 * tokens and computes Term Frequency (TF) and Inverse Document Frequency (IDF).
 */

public class TFIDFCalculator {

    /** Matches every run of characters that are neither Unicode letters nor Unicode numbers. */
    private static final Pattern NON_WORD_RUN = Pattern.compile("[^\\p{L}\\p{N}]+");

    /**
     * Reads a file and extracts its words (tokens).
     *
     * <p>The content is lower-cased with {@link Locale#ROOT} and split on every run of
     * characters that are not Unicode letters or numbers, so punctuation and whitespace
     * never appear in the result.
     *
     * <p>The file is decoded as UTF-8 first; if its bytes are not valid UTF-8 it is
     * re-read as ISO-8859-1 so that legacy single-byte text is still accepted.
     *
     * @param filePath the document to tokenize
     * @return the tokens in document order; an empty list if the file has no tokens or
     *         could not be read (the failure is reported on {@code System.err})
     */
    public static List<String> getWordsFromDocument(Path filePath) {
        try {
            String lowered = readContent(filePath).toLowerCase(Locale.ROOT);

            // Splitting on separator runs may yield one empty leading token (or a single
            // empty token for empty input); those are dropped by the filter.
            return NON_WORD_RUN.splitAsStream(lowered)
                    .filter(token -> !token.isEmpty())
                    .collect(Collectors.toList());

        } catch (IOException e) {
            System.err.println("Error reading file: " + filePath + " - " + e.getMessage());
            return Collections.emptyList();
        } catch (RuntimeException e) {
            // Best-effort contract: a bad path (e.g. null) must not abort the caller.
            System.err.println("Unexpected error processing file: " + filePath);
            e.printStackTrace();
            return Collections.emptyList();
        }
    }

    /**
     * Reads the whole file as text, preferring UTF-8 and falling back to ISO-8859-1.
     *
     * @param filePath the file to read
     * @return the decoded file content
     * @throws IOException if the file cannot be read
     */
    private static String readContent(Path filePath) throws IOException {
        try {
            // Files.readString rejects byte sequences that are invalid for the charset.
            return Files.readString(filePath, StandardCharsets.UTF_8);
        } catch (CharacterCodingException notUtf8) {
            // ISO-8859-1 maps every byte to a character, so this decode cannot fail
            // on content; it can only fail on I/O.
            return Files.readString(filePath, StandardCharsets.ISO_8859_1);
        }
    }

    /**
     * Calculates Term Frequency (TF): (count of term in doc) / (total words in doc).
     *
     * <p>Matching is case-insensitive. Null entries in {@code words} never match but
     * still count towards the total.
     *
     * @param words the document's tokens
     * @param term  the term to count
     * @return the term frequency, or {@code 0.0} if {@code words} is null or empty,
     *         or {@code term} is null
     */
    public static double calculateTermFrequency(List<String> words, String term) {
        if (words == null || words.isEmpty() || term == null) {
            return 0.0;
        }

        // Calling equalsIgnoreCase on the (non-null) term tolerates null list entries.
        long count = words.stream()
                .filter(term::equalsIgnoreCase)
                .count();

        return (double) count / words.size();
    }

    /**
     * Calculates Inverse Document Frequency (IDF): log10(total docs / docs containing term).
     *
     * @param totalDocuments    number of documents in the corpus
     * @param documentsWithTerm number of documents that contain the term
     * @return the IDF, or {@code 0.0} if either argument is zero or negative
     */
    public static double calculateIdf(int totalDocuments, int documentsWithTerm) {
        if (documentsWithTerm <= 0 || totalDocuments <= 0) {
            return 0.0;
        }
        return Math.log10((double) totalDocuments / documentsWithTerm);
    }
}
