feat: implement OCR parsing status tracking and biomarker count display

This commit is contained in:
2025-12-21 13:33:39 +05:30
parent fc6376abec
commit 0f277d6b3d
4 changed files with 126 additions and 59 deletions

View File

@@ -47,6 +47,7 @@ fn get_page_count(file_path: &PathBuf) -> usize {
}
/// POST /api/sources/:id/parse - Parse a source document using Mistral OCR.
/// Returns immediately with "processing" status; OCR runs in background.
pub async fn parse_source(
State(state): State<OcrState>,
Path(id): Path<i32>,
@@ -72,22 +73,63 @@ pub async fn parse_source(
)
})?;
// Check if already being processed
if source_entity.status == "processing" {
return Ok(Json(ParseResponse {
success: true,
biomarkers_count: 0,
message: "Already processing".to_string(),
}));
}
let file_path = PathBuf::from(&source_entity.file_path);
// 2. Upload file to Mistral
let file_id = mistral::upload_to_mistral(&state.mistral, &file_path).await.map_err(|e| {
// 2. Set status to "processing" immediately
let mut active_model: source::ActiveModel = source_entity.into();
active_model.status = Set("processing".to_string());
active_model.update(&state.db).await.map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Mistral upload failed: {}", e),
error: format!("Database update failed: {}", e),
}),
)
})?;
// 3. Get page count locally from PDF
// 3. Spawn background task for OCR processing
let db = state.db.clone();
let mistral_config = state.mistral.clone();
tokio::spawn(async move {
if let Err(e) = process_ocr_background(db, mistral_config, id, file_path).await {
tracing::error!("Background OCR failed for source {}: {}", id, e);
}
});
// 4. Return immediately
Ok(Json(ParseResponse {
success: true,
biomarkers_count: 0,
message: "Processing started".to_string(),
}))
}
/// Background OCR processing task
async fn process_ocr_background(
db: sea_orm::DatabaseConnection,
mistral_config: crate::config::MistralConfig,
source_id: i32,
file_path: PathBuf,
) -> Result<(), String> {
// Upload file to Mistral
let file_id = mistral::upload_to_mistral(&mistral_config, &file_path)
.await
.map_err(|e| format!("Mistral upload failed: {}", e))?;
// Get page count locally from PDF
let max_pages = get_page_count(&file_path);
let chunk_size = state.mistral.max_pages_per_request as usize;
let max_retries = state.mistral.max_retries;
let chunk_size = mistral_config.max_pages_per_request as usize;
let max_retries = mistral_config.max_retries;
let mut all_results: Vec<types::DocumentAnnotation> = Vec::new();
let mut failed_chunk: Option<String> = None;
@@ -101,7 +143,7 @@ pub async fn parse_source(
let mut chunk_result = None;
while attempts <= max_retries {
match mistral::ocr_pages(&state.mistral, &file_id, &pages).await {
match mistral::ocr_pages(&mistral_config, &file_id, &pages).await {
Ok(annotation) => {
chunk_result = Some(annotation);
break;
@@ -132,69 +174,59 @@ pub async fn parse_source(
}
}
// Fail if any chunk failed
// Handle failure
if let Some(error_msg) = failed_chunk {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("OCR parsing failed: {}", error_msg),
}),
));
// Update status to failed
if let Ok(Some(entity)) = source::Entity::find_by_id(source_id).one(&db).await {
let mut active_model: source::ActiveModel = entity.into();
active_model.status = Set("failed".to_string());
let _ = active_model.update(&db).await;
}
return Err(format!("OCR parsing failed: {}", error_msg));
}
if all_results.is_empty() {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: "No OCR results obtained".to_string(),
}),
));
// Update status to failed
if let Ok(Some(entity)) = source::Entity::find_by_id(source_id).one(&db).await {
let mut active_model: source::ActiveModel = entity.into();
active_model.status = Set("failed".to_string());
let _ = active_model.update(&db).await;
}
return Err("No OCR results obtained".to_string());
}
// 4. Get valid biomarker names from schema
let valid_biomarkers = schema::extract_valid_biomarker_names().map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Failed to read schema: {}", e),
}),
)
})?;
// Get valid biomarker names from schema
let valid_biomarkers = schema::extract_valid_biomarker_names()
.map_err(|e| format!("Failed to read schema: {}", e))?;
tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len());
// 5. Merge results with fuzzy matching
// Merge results with fuzzy matching
let merged = matching::merge_results(all_results, &valid_biomarkers);
// 6. Save to database
let ocr_json = serde_json::to_string(&merged).map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("JSON serialization failed: {}", e),
}),
)
})?;
// Save to database
let ocr_json = serde_json::to_string(&merged)
.map_err(|e| format!("JSON serialization failed: {}", e))?;
let source_entity = source::Entity::find_by_id(source_id)
.one(&db)
.await
.map_err(|e| format!("Database error: {}", e))?
.ok_or_else(|| "Source not found".to_string())?;
let mut active_model: source::ActiveModel = source_entity.into();
active_model.ocr_data = Set(Some(ocr_json));
active_model.status = Set("parsed".to_string());
active_model.biomarker_count = Set(Some(merged.biomarkers.len() as i32));
active_model.update(&state.db).await.map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Database update failed: {}", e),
}),
)
})?;
active_model.update(&db).await
.map_err(|e| format!("Database update failed: {}", e))?;
Ok(Json(ParseResponse {
success: true,
biomarkers_count: merged.biomarkers.len(),
message: format!(
"Successfully parsed {} biomarkers for {}",
merged.biomarkers.len(),
merged.patient_name.unwrap_or_else(|| "Unknown".to_string())
),
}))
tracing::info!(
"Successfully parsed {} biomarkers for source {}",
merged.biomarkers.len(),
source_id
);
Ok(())
}

View File

@@ -24,6 +24,8 @@ pub struct SourceResponse {
pub file_path: String,
pub file_type: String,
pub file_size: i64,
pub status: String,
pub biomarker_count: Option<i32>,
pub ocr_data: Option<String>,
pub description: Option<String>,
pub uploaded_at: String,
@@ -55,6 +57,8 @@ pub async fn list_sources(
file_path: s.file_path,
file_type: s.file_type,
file_size: s.file_size,
status: s.status,
biomarker_count: s.biomarker_count,
ocr_data: s.ocr_data,
description: s.description,
uploaded_at: s.uploaded_at.to_string(),
@@ -82,6 +86,8 @@ pub async fn get_source(
file_path: s.file_path,
file_type: s.file_type,
file_size: s.file_size,
status: s.status,
biomarker_count: s.biomarker_count,
ocr_data: s.ocr_data,
description: s.description,
uploaded_at: s.uploaded_at.to_string(),
@@ -166,6 +172,8 @@ pub async fn upload_source(
file_path: Set(file_path.to_string_lossy().to_string()),
file_type: Set(content_type.clone()),
file_size: Set(file_size),
status: Set("pending".to_string()),
biomarker_count: Set(None),
ocr_data: Set(None),
description: Set(description.clone()),
uploaded_at: Set(now),
@@ -187,6 +195,8 @@ pub async fn upload_source(
file_path: inserted.file_path,
file_type: inserted.file_type,
file_size: inserted.file_size,
status: inserted.status,
biomarker_count: inserted.biomarker_count,
ocr_data: inserted.ocr_data,
description: inserted.description,
uploaded_at: inserted.uploaded_at.to_string(),
@@ -256,6 +266,8 @@ pub async fn update_ocr(
file_path: updated.file_path,
file_type: updated.file_type,
file_size: updated.file_size,
status: updated.status,
biomarker_count: updated.biomarker_count,
ocr_data: updated.ocr_data,
description: updated.description,
uploaded_at: updated.uploaded_at.to_string(),

View File

@@ -27,6 +27,14 @@ pub struct Model {
/// File size in bytes
pub file_size: i64,
/// Parsing status: "pending", "processing", "parsed", "failed"
#[sea_orm(column_type = "Text")]
pub status: String,
/// Number of biomarkers extracted (populated after parsing)
#[sea_orm(nullable)]
pub biomarker_count: Option<i32>,
/// OCR parsed data as JSON
#[sea_orm(column_type = "Text", nullable)]
pub ocr_data: Option<String>,