# Files
# clm-system/clm-system/tests/test_ingestion.py
#
# 380 lines
# 16 KiB
# Python

"""
Comprehensive tests for document ingestion module using pytest
"""
import os
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from clm_system.ingestion import DocumentProcessor
from clm_system.ingestion import ProcessingResult
class TestProcessingResult:
    """Checks on how ProcessingResult stores the fields it is built with."""

    def test_processing_result_creation(self):
        """Every explicitly supplied field can be read back unchanged."""
        fields = {
            "success": True,
            "document_id": "test.pdf",
            "error": None,
            "metadata": {"chunks": 5, "file_size": 1024},
        }
        outcome = ProcessingResult(**fields)

        assert outcome.success is True
        assert outcome.document_id == "test.pdf"
        assert outcome.error is None
        assert outcome.metadata is not None
        assert outcome.metadata["chunks"] == 5
        assert outcome.metadata["file_size"] == 1024

    def test_processing_result_minimal(self):
        """Only ``success`` is given; the remaining fields default to None."""
        outcome = ProcessingResult(success=False)

        assert outcome.success is False
        assert outcome.document_id is None
        assert outcome.error is None
        assert outcome.metadata is None

    def test_processing_result_with_error(self):
        """A failed result keeps the error message it was given."""
        outcome = ProcessingResult(success=False, error="File not found")

        assert outcome.success is False
        assert outcome.error == "File not found"
class TestDocumentProcessor:
    """Test DocumentProcessor setup, upload handling, text extraction and storage."""

    @staticmethod
    def _fake_upload(filename: str) -> MagicMock:
        """Build a stand-in uploaded file with a name and a readable buffer."""
        upload = MagicMock()
        upload.name = filename
        upload.getbuffer.return_value = b"Test content"
        return upload

    def test_initialization(self, temp_dir: Path):
        """A new processor exposes its data dir, LanceDB path and text splitter."""
        processor = DocumentProcessor(data_dir=str(temp_dir))

        assert processor is not None
        assert processor.data_dir == str(temp_dir)
        assert processor.db_path == os.path.join(temp_dir, "lancedb")
        assert processor.text_splitter is not None

    @patch('clm_system.ingestion.OpenAIEmbeddings')
    def test_embeddings_initialization_success(self, mock_embeddings_class, temp_dir: Path):
        """With the OpenAI embeddings class patched, ``embeddings`` is populated."""
        mock_embeddings = MagicMock()
        mock_embeddings_class.return_value = mock_embeddings

        processor = DocumentProcessor(data_dir=str(temp_dir))

        assert processor.embeddings is not None

    # Patches are applied bottom-up, so the mock arguments arrive in the
    # reverse of the decorator order: config, Google, HuggingFace, OpenAI.
    @patch('clm_system.ingestion.OpenAIEmbeddings')
    @patch('clm_system.ingestion.HuggingFaceEmbeddings')
    @patch('clm_system.ingestion.GoogleGenerativeAIEmbeddings')
    @patch('clm_system.ingestion.config')
    def test_embeddings_initialization_failure(
        self,
        mock_config,
        mock_google_embeddings_class,
        mock_hf_embeddings_class,
        mock_openai_embeddings_class,
        temp_dir: Path,
    ):
        """When every embeddings constructor raises, ``embeddings`` is None."""
        # Mock config to use openai model
        mock_config.EMBEDDING_MODEL = "openai"
        mock_config.OPENAI_EMBEDDING_MODEL = "text-embedding-ada-002"

        # Mock all embeddings to fail
        mock_openai_embeddings_class.side_effect = Exception("API key not found")
        mock_hf_embeddings_class.side_effect = Exception("API key not found")
        mock_google_embeddings_class.side_effect = Exception("API key not found")

        processor = DocumentProcessor(data_dir=str(temp_dir))

        assert processor.embeddings is None

    def test_process_uploads_success(self, processor_with_mock_embeddings: DocumentProcessor):
        """Three uploads that all succeed are reported as three successes."""
        uploaded_files = [self._fake_upload(f"test_{i}.txt") for i in range(3)]

        # Stub the per-file step so only the aggregation is exercised.
        processor_with_mock_embeddings.process_single_file = MagicMock(
            return_value=ProcessingResult(success=True, document_id="test.txt")
        )

        result = processor_with_mock_embeddings.process_uploads(uploaded_files)

        assert result["success"] is True
        assert result["count"] == 3
        assert len(result["results"]) == 3

    def test_process_uploads_with_failure(self, processor_with_mock_embeddings: DocumentProcessor):
        """One success plus one failure still counts as an overall success."""
        uploaded_files = [self._fake_upload(f"test_{i}.txt") for i in range(2)]

        # Stub the per-file step to return one success followed by one failure.
        processor_with_mock_embeddings.process_single_file = MagicMock(side_effect=[
            ProcessingResult(success=True, document_id="test_0.txt"),
            ProcessingResult(success=False, error="Processing failed"),
        ])

        result = processor_with_mock_embeddings.process_uploads(uploaded_files)

        assert result["success"] is True  # At least one success
        assert result["count"] == 1
        assert len(result["results"]) == 2

    def test_process_single_file_success(self, processor_with_mock_embeddings: DocumentProcessor):
        """A readable upload yields the result returned by ``process_file``."""
        mock_file = self._fake_upload("test.txt")

        # Stub process_file to return success
        processor_with_mock_embeddings.process_file = MagicMock(
            return_value=ProcessingResult(success=True, document_id="test.txt")
        )

        result = processor_with_mock_embeddings.process_single_file(mock_file)

        assert result.success is True
        assert result.document_id == "test.txt"

    def test_process_single_file_error(self, processor_with_mock_embeddings: DocumentProcessor):
        """A failure while reading the upload buffer becomes a failed result."""
        mock_file = self._fake_upload("test.txt")
        mock_file.getbuffer = MagicMock(side_effect=Exception("File read error"))

        result = processor_with_mock_embeddings.process_single_file(mock_file)

        assert result.success is False
        assert result.error is not None
        assert "File read error" in result.error

    def test_process_file_empty_content(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """An empty text file is rejected with a "No text content found" error."""
        file_path = temp_dir / "empty.txt"
        file_path.write_text("")

        result = processor_with_mock_embeddings.process_file(str(file_path))

        assert result.success is False
        assert result.error is not None
        assert "No text content found" in result.error

    def test_extract_text_pdf(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """Text of a single mocked PDF page is returned with a trailing newline."""
        pdf_path = temp_dir / "test.pdf"
        pdf_path.write_text("dummy pdf content")  # Create dummy file

        # Mock PyPDF2 to return test content
        with patch('PyPDF2.PdfReader') as mock_pdf_reader:
            mock_page = MagicMock()
            mock_page.extract_text.return_value = "PDF test content"
            mock_pdf_reader.return_value.pages = [mock_page]

            text = processor_with_mock_embeddings.extract_pdf_text(str(pdf_path))

        assert text == "PDF test content\n"

    def test_extract_text_pdf_with_ocr_fallback(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """When PdfReader raises, the value returned by ``ocr_pdf`` is used."""
        pdf_path = temp_dir / "test.pdf"

        # Mock PyPDF2 to raise exception, then OCR to return content
        with patch('PyPDF2.PdfReader') as mock_pdf_reader, \
                patch.object(processor_with_mock_embeddings, 'ocr_pdf') as mock_ocr:
            mock_pdf_reader.side_effect = Exception("PDF read error")
            mock_ocr.return_value = "OCR content"

            text = processor_with_mock_embeddings.extract_pdf_text(str(pdf_path))

        assert text == "OCR content"

    def test_extract_text_docx(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """Text of a single mocked DOCX paragraph is returned as-is."""
        docx_path = temp_dir / "test.docx"

        # Mock python-docx at the module level
        with patch('clm_system.ingestion.Document') as mock_document:
            mock_doc = MagicMock()
            mock_doc.paragraphs = [MagicMock(text="DOCX test content")]
            mock_document.return_value = mock_doc

            text = processor_with_mock_embeddings.extract_docx_text(str(docx_path))

        assert text == "DOCX test content"

    def test_extract_text_txt(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A plain-text file's content is returned unchanged."""
        txt_path = temp_dir / "test.txt"
        txt_path.write_text("TXT test content")

        text = processor_with_mock_embeddings.extract_txt_text(str(txt_path))

        assert text == "TXT test content"

    def test_extract_text_unsupported_type(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A ``.jpg`` file makes ``extract_text`` raise ValueError."""
        unsupported_path = temp_dir / "test.jpg"
        unsupported_path.write_text("fake image content")

        with pytest.raises(ValueError) as exc_info:
            processor_with_mock_embeddings.extract_text(str(unsupported_path))

        assert "Unsupported file type" in str(exc_info.value)

    def test_ocr_pdf_placeholder(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """``ocr_pdf`` currently returns an empty string."""
        pdf_path = temp_dir / "test.pdf"

        text = processor_with_mock_embeddings.ocr_pdf(str(pdf_path))

        assert text == ""  # Placeholder returns empty string

    def test_store_documents_success(self, processor_with_mock_embeddings: DocumentProcessor, sample_documents: list):
        """With no existing tables, storing documents creates exactly one table."""
        db = processor_with_mock_embeddings.db

        # Mock LanceDB operations
        with patch.object(db, 'table_names', return_value=[]), \
                patch.object(db, 'create_table') as mock_create:
            mock_create.return_value = MagicMock()

            result = processor_with_mock_embeddings.store_documents(sample_documents)

        assert result is True
        mock_create.assert_called_once()

    def test_store_documents_no_embeddings(self, processor_with_mock_embeddings: DocumentProcessor, sample_documents: list):
        """Storage reports failure when ``embeddings`` is None."""
        processor_with_mock_embeddings.embeddings = None

        result = processor_with_mock_embeddings.store_documents(sample_documents)

        assert result is False

    def test_store_documents_exception(self, temp_dir: Path, sample_documents: list):
        """Storage reports failure when ``embed_documents`` raises."""
        processor = DocumentProcessor(data_dir=str(temp_dir))
        processor.embeddings = MagicMock()
        processor.embeddings.embed_documents.side_effect = Exception("Embedding error")

        result = processor.store_documents(sample_documents)

        assert result is False

    def test_get_table_exists(self, processor_with_mock_embeddings: DocumentProcessor):
        """A table listed by the database is returned via ``open_table``."""
        db = processor_with_mock_embeddings.db

        with patch.object(db, 'table_names', return_value=['contracts']), \
                patch.object(db, 'open_table') as mock_open:
            mock_table = MagicMock()
            mock_open.return_value = mock_table

            table = processor_with_mock_embeddings.get_table('contracts')

        assert table == mock_table

    def test_get_table_not_exists(self, processor_with_mock_embeddings: DocumentProcessor):
        """Asking for a table the database does not list yields None."""
        with patch.object(processor_with_mock_embeddings.db, 'table_names', return_value=[]):
            table = processor_with_mock_embeddings.get_table('nonexistent')

        assert table is None
class TestDocumentProcessorEdgeCases:
    """Failure paths and unusual inputs for DocumentProcessor."""

    def test_process_file_nonexistent(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A path that was never created produces a failed result with an error."""
        missing = str(temp_dir / "nonexistent.pdf")

        outcome = processor_with_mock_embeddings.process_file(missing)

        assert outcome.success is False
        assert outcome.error is not None

    def test_extract_pdf_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """If PdfReader raises, PDF extraction yields an empty string."""
        target = str(temp_dir / "test.pdf")

        # PyPDF2's reader is replaced by a mock that raises when called.
        with patch('PyPDF2.PdfReader', side_effect=Exception("PDF read error")):
            extracted = processor_with_mock_embeddings.extract_pdf_text(target)

        assert extracted == ""  # Should return empty string on error

    def test_extract_docx_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A ValueError from the Document constructor reaches the caller."""
        target = str(temp_dir / "test.docx")

        with patch('clm_system.ingestion.Document', side_effect=ValueError("DOCX corrupted")), \
                pytest.raises(ValueError):
            processor_with_mock_embeddings.extract_docx_text(target)

    def test_extract_txt_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A PermissionError from ``open`` reaches the caller."""
        txt_file = temp_dir / "test.txt"
        txt_file.write_text("content")
        target = str(txt_file)

        # The file is written first, because open() is unusable once patched.
        with patch('builtins.open', side_effect=PermissionError("Permission denied")), \
                pytest.raises(PermissionError):
            processor_with_mock_embeddings.extract_txt_text(target)
# Parametrized tests for different file types
@pytest.mark.parametrize("file_extension,file_content,expected_method", [
    (".pdf", b"PDF content", "extract_pdf_text"),
    (".docx", b"DOCX content", "extract_docx_text"),
    (".txt", "TXT content", "extract_txt_text"),
])
def test_extract_text_by_type(temp_dir: Path, file_extension: str, file_content, expected_method: str):
    """``extract_text`` hands each extension to its matching extractor method."""
    processor = DocumentProcessor(data_dir=str(temp_dir))
    target = temp_dir / f"test{file_extension}"

    # Text payloads are written as text, everything else as raw bytes.
    writer = target.write_text if isinstance(file_content, str) else target.write_bytes
    writer(file_content)

    stub_output = f"Extracted {file_extension} content"

    # Replace the extractor under test so only the dispatch is checked.
    with patch.object(processor, expected_method, return_value=stub_output) as extractor:
        extracted = processor.extract_text(str(target))

    assert extracted == f"Extracted {file_extension} content"
    extractor.assert_called_once_with(str(target))
# Test for file processing pipeline
def test_full_processing_pipeline(temp_dir: Path, sample_files: dict[str, Path]):
    """Run ``process_file`` on the sample TXT file with storage stubbed out."""
    processor = DocumentProcessor(data_dir=str(temp_dir))

    # Test with TXT file (easiest to mock)
    source_path = str(sample_files["txt"])

    # Storage is stubbed to report success.
    with patch.object(processor, 'store_documents', return_value=True) as store_stub:
        outcome = processor.process_file(source_path)

    assert outcome.success is True
    assert outcome.document_id == "test.txt"
    assert outcome.metadata is not None
    assert "chunks" in outcome.metadata
    store_stub.assert_called_once()