380 lines
16 KiB
Python
380 lines
16 KiB
Python
"""
Comprehensive tests for the document ingestion module, using pytest.
"""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from clm_system.ingestion import DocumentProcessor
|
|
from clm_system.ingestion import ProcessingResult
|
|
|
|
|
|
class TestProcessingResult:
    """Checks that ProcessingResult stores and defaults its fields."""

    def test_processing_result_creation(self):
        """Every field handed to the constructor is readable afterwards."""
        meta = {"chunks": 5, "file_size": 1024}
        outcome = ProcessingResult(
            success=True,
            document_id="test.pdf",
            error=None,
            metadata=meta,
        )

        assert outcome.success is True
        assert outcome.document_id == "test.pdf"
        assert outcome.error is None
        assert outcome.metadata is not None
        assert outcome.metadata["chunks"] == 5
        assert outcome.metadata["file_size"] == 1024

    def test_processing_result_minimal(self):
        """With only ``success`` supplied, the other fields are None."""
        outcome = ProcessingResult(success=False)

        assert outcome.success is False
        assert outcome.document_id is None
        assert outcome.error is None
        assert outcome.metadata is None

    def test_processing_result_with_error(self):
        """A failed result carries the error message it was given."""
        outcome = ProcessingResult(success=False, error="File not found")

        assert outcome.success is False
        assert outcome.error == "File not found"
|
|
|
|
|
|
class TestDocumentProcessor:
    """Behavioural tests for DocumentProcessor.

    The ``temp_dir``, ``processor_with_mock_embeddings`` and
    ``sample_documents`` arguments are injected by pytest as fixtures
    that are defined outside this module.
    """

    def test_initialization(self, temp_dir: Path):
        """A new processor exposes its data dir, DB path and text splitter."""
        data_dir = str(temp_dir)
        processor = DocumentProcessor(data_dir=data_dir)

        assert processor is not None
        assert processor.data_dir == data_dir
        assert processor.db_path == os.path.join(temp_dir, "lancedb")
        assert processor.text_splitter is not None

    @patch("clm_system.ingestion.OpenAIEmbeddings")
    def test_embeddings_initialization_success(self, mock_embeddings_class, temp_dir: Path):
        """With OpenAIEmbeddings patched, the processor ends up with embeddings."""
        mock_embeddings_class.return_value = MagicMock()

        processor = DocumentProcessor(data_dir=str(temp_dir))

        assert processor.embeddings is not None

    @patch("clm_system.ingestion.OpenAIEmbeddings")
    @patch("clm_system.ingestion.HuggingFaceEmbeddings")
    @patch("clm_system.ingestion.GoogleGenerativeAIEmbeddings")
    @patch("clm_system.ingestion.config")
    def test_embeddings_initialization_failure(
        self,
        mock_config,
        mock_google_embeddings_class,
        mock_hf_embeddings_class,
        mock_openai_embeddings_class,
        temp_dir: Path,
    ):
        """When every embeddings class raises, ``embeddings`` is left as None."""
        # Point the patched config at the OpenAI model.
        mock_config.EMBEDDING_MODEL = "openai"
        mock_config.OPENAI_EMBEDDING_MODEL = "text-embedding-ada-002"

        # Make each embeddings constructor blow up.
        for failing_class in (
            mock_openai_embeddings_class,
            mock_hf_embeddings_class,
            mock_google_embeddings_class,
        ):
            failing_class.side_effect = Exception("API key not found")

        processor = DocumentProcessor(data_dir=str(temp_dir))

        assert processor.embeddings is None

    def test_process_uploads_success(self, processor_with_mock_embeddings: DocumentProcessor):
        """Three uploads that all succeed are all counted and reported."""
        processor = processor_with_mock_embeddings

        # Build three stand-ins for uploaded files.
        uploads = []
        for index in range(3):
            upload = MagicMock()
            upload.name = f"test_{index}.txt"
            upload.getbuffer.return_value = b"Test content"
            uploads.append(upload)

        # Every per-file call reports success.
        canned = ProcessingResult(success=True, document_id="test.txt")
        processor.process_single_file = MagicMock(return_value=canned)

        summary = processor.process_uploads(uploads)

        assert summary["success"] is True
        assert summary["count"] == 3
        assert len(summary["results"]) == 3

    def test_process_uploads_with_failure(self, processor_with_mock_embeddings: DocumentProcessor):
        """One success and one failure: count is 1, both results are returned."""
        processor = processor_with_mock_embeddings

        uploads = []
        for index in range(2):
            upload = MagicMock()
            upload.name = f"test_{index}.txt"
            upload.getbuffer.return_value = b"Test content"
            uploads.append(upload)

        # First file succeeds, second fails.
        per_file_outcomes = [
            ProcessingResult(success=True, document_id="test_0.txt"),
            ProcessingResult(success=False, error="Processing failed"),
        ]
        processor.process_single_file = MagicMock(side_effect=per_file_outcomes)

        summary = processor.process_uploads(uploads)

        # A single success is enough for the batch to be flagged successful.
        assert summary["success"] is True
        assert summary["count"] == 1
        assert len(summary["results"]) == 2

    def test_process_single_file_success(self, processor_with_mock_embeddings: DocumentProcessor):
        """A single upload is reported as successful when process_file succeeds."""
        processor = processor_with_mock_embeddings

        upload = MagicMock()
        upload.name = "test.txt"
        upload.getbuffer.return_value = b"Test content"

        # Replace process_file with a stub that reports success.
        canned = ProcessingResult(success=True, document_id="test.txt")
        processor.process_file = MagicMock(return_value=canned)

        outcome = processor.process_single_file(upload)

        assert outcome.success is True
        assert outcome.document_id == "test.txt"

    def test_process_single_file_error(self, processor_with_mock_embeddings: DocumentProcessor):
        """An exception from ``getbuffer`` comes back as a failed result."""
        upload = MagicMock()
        upload.name = "test.txt"
        upload.getbuffer = MagicMock(side_effect=Exception("File read error"))

        outcome = processor_with_mock_embeddings.process_single_file(upload)

        assert outcome.success is False
        assert outcome.error is not None
        assert "File read error" in outcome.error

    def test_process_file_empty_content(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """An empty text file is rejected with a 'No text content found' error."""
        empty_file = temp_dir / "empty.txt"
        empty_file.write_text("")

        outcome = processor_with_mock_embeddings.process_file(str(empty_file))

        assert outcome.success is False
        assert outcome.error is not None
        assert "No text content found" in outcome.error

    def test_extract_text_pdf(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """Text from a (mocked) single-page PDF is returned with a trailing newline."""
        # A dummy file on disk; its real content is never parsed.
        pdf_file = temp_dir / "test.pdf"
        pdf_file.write_text("dummy pdf content")

        # Have PyPDF2 hand back one page with known text.
        with patch("PyPDF2.PdfReader") as reader_class:
            page = MagicMock()
            page.extract_text.return_value = "PDF test content"
            reader_class.return_value.pages = [page]

            extracted = processor_with_mock_embeddings.extract_pdf_text(str(pdf_file))

            assert extracted == "PDF test content\n"

    def test_extract_text_pdf_with_ocr_fallback(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """When PdfReader raises, the text produced by ``ocr_pdf`` is returned."""
        processor = processor_with_mock_embeddings
        pdf_file = temp_dir / "test.pdf"

        # PyPDF2 fails first; the OCR stub then supplies the content.
        with patch("PyPDF2.PdfReader", side_effect=Exception("PDF read error")):
            with patch.object(processor, "ocr_pdf", return_value="OCR content"):
                extracted = processor.extract_pdf_text(str(pdf_file))

                assert extracted == "OCR content"

    def test_extract_text_docx(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """Text of a (mocked) one-paragraph DOCX document is returned as-is."""
        docx_file = temp_dir / "test.docx"

        # Patch the Document name as seen from the ingestion module.
        with patch("clm_system.ingestion.Document") as document_class:
            fake_doc = MagicMock()
            fake_doc.paragraphs = [MagicMock(text="DOCX test content")]
            document_class.return_value = fake_doc

            extracted = processor_with_mock_embeddings.extract_docx_text(str(docx_file))

            assert extracted == "DOCX test content"

    def test_extract_text_txt(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """The content of a plain text file is read back unchanged."""
        txt_file = temp_dir / "test.txt"
        txt_file.write_text("TXT test content")

        extracted = processor_with_mock_embeddings.extract_txt_text(str(txt_file))

        assert extracted == "TXT test content"

    def test_extract_text_unsupported_type(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A ``.jpg`` file makes ``extract_text`` raise ValueError."""
        image_file = temp_dir / "test.jpg"
        image_file.write_text("fake image content")

        with pytest.raises(ValueError) as raised:
            processor_with_mock_embeddings.extract_text(str(image_file))

        assert "Unsupported file type" in str(raised.value)

    def test_ocr_pdf_placeholder(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """``ocr_pdf`` (treated here as a placeholder) returns an empty string."""
        pdf_file = temp_dir / "test.pdf"

        extracted = processor_with_mock_embeddings.ocr_pdf(str(pdf_file))

        assert extracted == ""

    def test_store_documents_success(self, processor_with_mock_embeddings: DocumentProcessor, sample_documents: list):
        """With no existing tables, storing documents creates one and returns True."""
        processor = processor_with_mock_embeddings

        # Stand in for the LanceDB calls: no tables yet, creation is mocked.
        with patch.object(processor.db, "table_names", return_value=[]), \
                patch.object(processor.db, "create_table") as create_table:
            create_table.return_value = MagicMock()

            stored = processor.store_documents(sample_documents)

            assert stored is True
            create_table.assert_called_once()

    def test_store_documents_no_embeddings(self, processor_with_mock_embeddings: DocumentProcessor, sample_documents: list):
        """Storing documents returns False when ``embeddings`` is None."""
        processor = processor_with_mock_embeddings
        processor.embeddings = None

        stored = processor.store_documents(sample_documents)

        assert stored is False

    def test_store_documents_exception(self, temp_dir: Path, sample_documents: list):
        """An exception raised by ``embed_documents`` makes storage return False."""
        processor = DocumentProcessor(data_dir=str(temp_dir))
        broken_embeddings = MagicMock()
        broken_embeddings.embed_documents.side_effect = Exception("Embedding error")
        processor.embeddings = broken_embeddings

        stored = processor.store_documents(sample_documents)

        assert stored is False

    def test_get_table_exists(self, processor_with_mock_embeddings: DocumentProcessor):
        """``get_table`` returns what ``open_table`` yields for a listed table."""
        processor = processor_with_mock_embeddings

        with patch.object(processor.db, "table_names", return_value=["contracts"]), \
                patch.object(processor.db, "open_table") as open_table:
            expected_table = MagicMock()
            open_table.return_value = expected_table

            table = processor.get_table("contracts")

            assert table == expected_table

    def test_get_table_not_exists(self, processor_with_mock_embeddings: DocumentProcessor):
        """``get_table`` returns None when the table name is not listed."""
        processor = processor_with_mock_embeddings

        with patch.object(processor.db, "table_names", return_value=[]):
            table = processor.get_table("nonexistent")

            assert table is None
|
|
|
|
|
|
class TestDocumentProcessorEdgeCases:
    """Error paths and unusual inputs for DocumentProcessor."""

    def test_process_file_nonexistent(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """Processing a path that was never created yields a failed result."""
        missing_file = temp_dir / "nonexistent.pdf"

        outcome = processor_with_mock_embeddings.process_file(str(missing_file))

        assert outcome.success is False
        assert outcome.error is not None

    def test_extract_pdf_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A PdfReader failure results in an empty string rather than an error."""
        pdf_file = temp_dir / "test.pdf"

        # PyPDF2 is forced to raise as soon as a reader is constructed.
        with patch("PyPDF2.PdfReader", side_effect=Exception("PDF read error")):
            extracted = processor_with_mock_embeddings.extract_pdf_text(str(pdf_file))

            assert extracted == ""

    def test_extract_docx_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A ValueError from Document propagates out of ``extract_docx_text``."""
        docx_file = temp_dir / "test.docx"

        with patch("clm_system.ingestion.Document", side_effect=ValueError("DOCX corrupted")), \
                pytest.raises(ValueError):
            processor_with_mock_embeddings.extract_docx_text(str(docx_file))

    def test_extract_txt_text_exception(self, processor_with_mock_embeddings: DocumentProcessor, temp_dir: Path):
        """A PermissionError from ``open`` propagates out of ``extract_txt_text``."""
        txt_file = temp_dir / "test.txt"
        # Written before ``open`` is patched, so the file really exists.
        txt_file.write_text("content")

        with patch("builtins.open", side_effect=PermissionError("Permission denied")), \
                pytest.raises(PermissionError):
            processor_with_mock_embeddings.extract_txt_text(str(txt_file))
|
|
|
|
|
|
# Parametrized tests for different file types
|
|
@pytest.mark.parametrize(
    "file_extension,file_content,expected_method",
    [
        (".pdf", b"PDF content", "extract_pdf_text"),
        (".docx", b"DOCX content", "extract_docx_text"),
        (".txt", "TXT content", "extract_txt_text"),
    ],
)
def test_extract_text_by_type(temp_dir: Path, file_extension: str, file_content, expected_method: str):
    """``extract_text`` delegates to the extractor matching the file extension.

    Each case writes a small file with the given extension, replaces the
    expected extractor with a mock, and checks that ``extract_text``
    returns the mock's value after calling it once with the file path.
    """
    processor = DocumentProcessor(data_dir=str(temp_dir))
    target = temp_dir / f"test{file_extension}"

    # Text payloads are written as text, everything else as raw bytes.
    write = target.write_text if isinstance(file_content, str) else target.write_bytes
    write(file_content)

    canned_text = f"Extracted {file_extension} content"

    # Swap in a mock for the one extractor this case should reach.
    with patch.object(processor, expected_method, return_value=canned_text) as extractor:
        extracted = processor.extract_text(str(target))

        assert extracted == f"Extracted {file_extension} content"
        extractor.assert_called_once_with(str(target))
|
|
|
|
|
|
# Test for file processing pipeline
|
|
def test_full_processing_pipeline(temp_dir: Path, sample_files: dict[str, Path]):
    """Run ``process_file`` end to end on the sample TXT file.

    Only ``store_documents`` is replaced (it reports success); the rest of
    the processor runs as constructed. The result must be successful, carry
    the document id ``test.txt`` and metadata with a ``chunks`` entry, and
    storage must have been invoked exactly once.
    """
    processor = DocumentProcessor(data_dir=str(temp_dir))

    # Stub out the storage step so nothing is written to the database.
    with patch.object(processor, "store_documents", return_value=True) as store_stub:
        # The TXT sample needs no parser mocking.
        source_file = sample_files["txt"]
        outcome = processor.process_file(str(source_file))

        assert outcome.success is True
        assert outcome.document_id == "test.txt"
        assert outcome.metadata is not None
        assert "chunks" in outcome.metadata
        store_stub.assert_called_once()
|