# clm-system/clm-system/tests/test_ingestion.py

"""
Comprehensive tests for document ingestion module using pytest
"""

import os
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest

from clm_system.ingestion import DocumentProcessor
from clm_system.ingestion import ProcessingResult


class TestProcessingResult:
    """Checks on the fields carried by a ProcessingResult instance."""

    def test_processing_result_creation(self):
        """Build a result with every field supplied and read each one back."""
        meta = {"chunks": 5, "file_size": 1024}
        outcome = ProcessingResult(
            success=True,
            document_id="test.pdf",
            error=None,
            metadata=meta,
        )

        assert outcome.success is True
        assert outcome.document_id == "test.pdf"
        assert outcome.error is None
        assert outcome.metadata is not None
        # Both metadata entries must survive construction unchanged.
        assert outcome.metadata["chunks"] == 5
        assert outcome.metadata["file_size"] == 1024

    def test_processing_result_minimal(self):
        """Build a result from ``success`` alone; the other fields must be None."""
        outcome = ProcessingResult(success=False)

        assert outcome.success is False
        assert outcome.document_id is None
        assert outcome.error is None
        assert outcome.metadata is None

    def test_processing_result_with_error(self):
        """Build a failed result and confirm the error message is kept verbatim."""
        outcome = ProcessingResult(success=False, error="File not found")

        assert outcome.success is False
        assert outcome.error == "File not found"


class TestDocumentProcessor:
    """Tests for DocumentProcessor: construction, uploads, extraction, storage.

    The ``temp_dir``, ``processor_with_mock_embeddings`` and
    ``sample_documents`` fixtures are defined outside this file
    (presumably in a conftest.py -- verify).
    """

    def test_initialization(self, temp_dir: Path):
        """A processor built on ``temp_dir`` exposes the expected paths and splitter."""
        processor = DocumentProcessor(data_dir=str(temp_dir))

        assert processor is not None
        assert processor.data_dir == str(temp_dir)
        # The LanceDB directory is expected directly under the data dir.
        assert processor.db_path == os.path.join(temp_dir, "lancedb")
        assert processor.text_splitter is not None

    @patch('clm_system.ingestion.OpenAIEmbeddings')
    def test_embeddings_initialization_success(self, mock_embeddings_class, temp_dir: Path):
        """With OpenAIEmbeddings patched, the processor ends up with embeddings set.

        NOTE(review): the ingestion config is not patched here, so whether the
        patched OpenAI class is the one actually used depends on the ambient
        configuration -- confirm.
        """
        mock_embeddings = MagicMock()
        mock_embeddings_class.return_value = mock_embeddings

        processor = DocumentProcessor(data_dir=str(temp_dir))
        assert processor.embeddings is not None

    # Decorators apply bottom-up, so the mock arguments arrive in the order
    # config, Google, HuggingFace, OpenAI.
    @patch('clm_system.ingestion.OpenAIEmbeddings')
    @patch('clm_system.ingestion.HuggingFaceEmbeddings')
    @patch('clm_system.ingestion.GoogleGenerativeAIEmbeddings')
    @patch('clm_system.ingestion.config')
    def test_embeddings_initialization_failure(self, mock_config, mock_google_embeddings_class, mock_hf_embeddings_class, mock_openai_embeddings_class, temp_dir: Path):
        """When every embeddings backend raises, ``processor.embeddings`` is None."""
        # Point the (mocked) config at the OpenAI model.
        mock_config.EMBEDDING_MODEL = "openai"
        mock_config.OPENAI_EMBEDDING_MODEL = "text-embedding-ada-002"

        # Make every backend constructor fail.
        mock_openai_embeddings_class.side_effect = Exception("API key not found")
        mock_hf_embeddings_class.side_effect = Exception("API key not found")
        mock_google_embeddings_class.side_effect = Exception("API key not found")

        processor = DocumentProcessor(data_dir=str(temp_dir))
        assert processor.embeddings is None

    def test_process_uploads_success(self, processor_with_mock_embeddings: DocumentProcessor):
        """Three uploads that all succeed are reported as three successes."""
        uploaded_files = []
        for i in range(3):
            mock_file = MagicMock()
            mock_file.name = f"test_{i}.txt"
            mock_file.getbuffer.return_value = b"Test content"
            uploaded_files.append(mock_file)

        # Stub the per-file step so only the aggregation logic is exercised.
        single_file_mock = MagicMock(return_value=ProcessingResult(
            success=True,
            document_id="test.txt"
        ))
        processor_with_mock_embeddings.process_single_file = single_file_mock

        result = processor_with_mock_embeddings.process_uploads(uploaded_files)

        assert result["success"] is True
        assert result["count"] == 3
        assert len(result["results"]) == 3
        # Each uploaded file must have gone through the per-file step once.
        assert single_file_mock.call_count == len(uploaded_files)

    def test_process_uploads_with_failure(self, processor_with_mock_embeddings: DocumentProcessor):
        """One success plus one failure still counts as an overall success."""
        uploaded_files = []
        for i in range(2):
            mock_file = MagicMock()
            mock_file.name = f"test_{i}.txt"
            mock_file.getbuffer.return_value = b"Test content"
            uploaded_files.append(mock_file)

        # First call succeeds, second call fails.
        single_file_mock = MagicMock(side_effect=[
            ProcessingResult(success=True, document_id="test_0.txt"),
            ProcessingResult(success=False, error="Processing failed")
        ])
        processor_with_mock_embeddings.process_single_file = single_file_mock

        result = processor_with_mock_embeddings.process_uploads(uploaded_files)

        assert result["success"] is True  # At least one success
        assert result["count"] == 1
        assert len(result["results"]) == 2
        # Each uploaded file must have gone through the per-file step once.
        assert single_file_mock.call_count == len(uploaded_files)

    def test_process_single_file_success(self, processor_with_mock_embeddings: DocumentProcessor):
        """A readable upload yields the result produced by ``process_file``."""
        mock_file = MagicMock()
        mock_file.name = "test.txt"
        mock_file.getbuffer.return_value = b"Test content"

        # Stub the downstream step so no real extraction or storage runs.
        processor_with_mock_embeddings.process_file = MagicMock(return_value=ProcessingResult(
            success=True,
            document_id="test.txt"
        ))

        result = processor_with_mock_embeddings.process_single_file(mock_file)

        assert result.success is True
        assert result.document_id == "test.txt"

    def test_process_single_file_error(self, processor_with_mock_embeddings: DocumentProcessor):
        """A failure while reading the upload becomes a failed result, not a raise."""
        mock_file = MagicMock()
        mock_file.name = "test.txt"
        mock_file.getbuffer = MagicMock(side_effect=Exception("File read error"))

        result = processor_with_mock_embeddings.process_single_file(mock_file)

        assert result.success is False
        assert result.error is not None
        assert "File read error" in result.error

    def test_process_file_empty_content(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """An empty text file is rejected with a 'No text content found' error."""
        file_path = temp_dir / "empty.txt"
        file_path.write_text("")

        result = processor_with_mock_embeddings.process_file(str(file_path))

        assert result.success is False
        assert result.error is not None
        assert "No text content found" in result.error

    def test_extract_text_pdf(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """PDF extraction returns the page text followed by a newline."""
        # The file only has to exist; its content is never parsed for real.
        pdf_path = temp_dir / "test.pdf"
        pdf_path.write_text("dummy pdf content")

        # Replace the PDF reader with one exposing a single fake page.
        with patch('PyPDF2.PdfReader') as mock_pdf_reader:
            mock_page = MagicMock()
            mock_page.extract_text.return_value = "PDF test content"
            mock_pdf_reader.return_value.pages = [mock_page]

            text = processor_with_mock_embeddings.extract_pdf_text(str(pdf_path))

            assert text == "PDF test content\n"

    def test_extract_text_pdf_with_ocr_fallback(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """When the PDF reader raises, the text returned is whatever ``ocr_pdf`` gives."""
        pdf_path = temp_dir / "test.pdf"
        # The reader fails; the processor's OCR hook supplies the content.
        with patch('PyPDF2.PdfReader') as mock_pdf_reader, \
             patch.object(processor_with_mock_embeddings, 'ocr_pdf') as mock_ocr:

            mock_pdf_reader.side_effect = Exception("PDF read error")
            mock_ocr.return_value = "OCR content"

            text = processor_with_mock_embeddings.extract_pdf_text(str(pdf_path))

            assert text == "OCR content"

    def test_extract_text_docx(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """DOCX extraction returns the text of the document's paragraph."""
        docx_path = temp_dir / "test.docx"

        # Patch Document where the ingestion module looks it up.
        with patch('clm_system.ingestion.Document') as mock_document:
            mock_doc = MagicMock()
            mock_doc.paragraphs = [MagicMock(text="DOCX test content")]
            mock_document.return_value = mock_doc

            text = processor_with_mock_embeddings.extract_docx_text(str(docx_path))

            assert text == "DOCX test content"

    def test_extract_text_txt(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """TXT extraction returns the file content unchanged."""
        txt_path = temp_dir / "test.txt"
        txt_path.write_text("TXT test content")

        text = processor_with_mock_embeddings.extract_txt_text(str(txt_path))

        assert text == "TXT test content"

    def test_extract_text_unsupported_type(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """An unsupported extension makes ``extract_text`` raise ValueError."""
        unsupported_path = temp_dir / "test.jpg"
        unsupported_path.write_text("fake image content")

        # ``match`` searches the exception message for this substring.
        with pytest.raises(ValueError, match="Unsupported file type"):
            processor_with_mock_embeddings.extract_text(str(unsupported_path))

    def test_ocr_pdf_placeholder(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """``ocr_pdf`` currently returns an empty string."""
        pdf_path = temp_dir / "test.pdf"
        text = processor_with_mock_embeddings.ocr_pdf(str(pdf_path))
        assert text == ""  # Placeholder returns empty string

    def test_store_documents_success(self, processor_with_mock_embeddings: DocumentProcessor, sample_documents: list):
        """With no existing tables, storing documents creates exactly one table."""
        # Stub the LanceDB handle: no tables exist yet, creation is intercepted.
        with patch.object(processor_with_mock_embeddings.db, 'table_names', return_value=[]), \
             patch.object(processor_with_mock_embeddings.db, 'create_table') as mock_create:
            mock_table = MagicMock()
            mock_create.return_value = mock_table

            result = processor_with_mock_embeddings.store_documents(sample_documents)
            assert result is True
            mock_create.assert_called_once()

    def test_store_documents_no_embeddings(self, processor_with_mock_embeddings: DocumentProcessor, sample_documents: list):
        """Storage reports failure when the processor has no embeddings."""
        processor_with_mock_embeddings.embeddings = None

        result = processor_with_mock_embeddings.store_documents(sample_documents)
        assert result is False

    def test_store_documents_exception(self, temp_dir: Path, sample_documents: list):
        """An error raised while embedding is reported as a False return."""
        processor = DocumentProcessor(data_dir=str(temp_dir))
        # Swap in embeddings whose embed_documents call always fails.
        processor.embeddings = MagicMock()
        processor.embeddings.embed_documents.side_effect = Exception("Embedding error")

        result = processor.store_documents(sample_documents)
        assert result is False

    def test_get_table_exists(self, processor_with_mock_embeddings: DocumentProcessor):
        """``get_table`` returns the opened table when its name is listed."""
        with patch.object(processor_with_mock_embeddings.db, 'table_names', return_value=['contracts']), \
             patch.object(processor_with_mock_embeddings.db, 'open_table') as mock_open:
            mock_table = MagicMock()
            mock_open.return_value = mock_table

            table = processor_with_mock_embeddings.get_table('contracts')
            assert table == mock_table

    def test_get_table_not_exists(self, processor_with_mock_embeddings: DocumentProcessor):
        """``get_table`` returns None when the name is not listed."""
        with patch.object(processor_with_mock_embeddings.db, 'table_names', return_value=[]):
            table = processor_with_mock_embeddings.get_table('nonexistent')
            assert table is None


class TestDocumentProcessorEdgeCases:
    """Failure-path tests: missing files and extractors that raise."""

    def test_process_file_nonexistent(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """Processing a path that was never created yields a failed result."""
        missing = temp_dir / "nonexistent.pdf"

        outcome = processor_with_mock_embeddings.process_file(str(missing))

        assert outcome.success is False
        assert outcome.error is not None

    def test_extract_pdf_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A raising PDF reader results in an empty string rather than an error."""
        target = temp_dir / "test.pdf"
        # The reader constructor itself raises.
        with patch('PyPDF2.PdfReader', side_effect=Exception("PDF read error")):
            extracted = processor_with_mock_embeddings.extract_pdf_text(str(target))

            assert extracted == ""  # Should return empty string on error

    def test_extract_docx_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A ValueError from the DOCX loader propagates to the caller."""
        target = temp_dir / "test.docx"

        with patch('clm_system.ingestion.Document', side_effect=ValueError("DOCX corrupted")):
            with pytest.raises(ValueError):
                processor_with_mock_embeddings.extract_docx_text(str(target))

    def test_extract_txt_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A PermissionError from ``open`` propagates to the caller."""
        target = temp_dir / "test.txt"
        # Create the file before ``open`` is replaced with a raising stub.
        target.write_text("content")

        with patch('builtins.open', side_effect=PermissionError("Permission denied")):
            with pytest.raises(PermissionError):
                processor_with_mock_embeddings.extract_txt_text(str(target))


# Parametrized tests for different file types
@pytest.mark.parametrize(
    "file_extension,file_content,expected_method",
    [
        (".pdf", b"PDF content", "extract_pdf_text"),
        (".docx", b"DOCX content", "extract_docx_text"),
        (".txt", "TXT content", "extract_txt_text"),
    ],
)
def test_extract_text_by_type(temp_dir: Path, file_extension: str, file_content, expected_method: str):
    """Check that ``extract_text`` delegates to the extractor for each extension.

    Each case writes a small file with the given extension, stubs the
    extractor named by ``expected_method``, and verifies that ``extract_text``
    returns the stub's value after calling it once with the file path.
    """
    processor = DocumentProcessor(data_dir=str(temp_dir))
    target = temp_dir / f"test{file_extension}"

    # Text payloads are written as text, everything else as raw bytes.
    write = target.write_text if isinstance(file_content, str) else target.write_bytes
    write(file_content)

    stub_value = f"Extracted {file_extension} content"
    with patch.object(processor, expected_method, return_value=stub_value) as extractor_stub:
        extracted = processor.extract_text(str(target))
        assert extracted == f"Extracted {file_extension} content"
        extractor_stub.assert_called_once_with(str(target))


# Test for file processing pipeline
def test_full_processing_pipeline(temp_dir: Path, sample_files: dict[str, Path]):
    """Run ``process_file`` end to end on the sample TXT file.

    Only ``store_documents`` is stubbed (it reports success); everything
    before the storage step runs for real.
    """
    processor = DocumentProcessor(data_dir=str(temp_dir))
    # The TXT sample is used because it is the easiest type to handle.
    txt_file = sample_files["txt"]

    with patch.object(processor, 'store_documents', return_value=True) as store_stub:
        outcome = processor.process_file(str(txt_file))

    assert outcome.success is True
    assert outcome.document_id == "test.txt"
    assert outcome.metadata is not None
    assert "chunks" in outcome.metadata
    # Storage must have been reached once.
    store_stub.assert_called_once()