diff --git a/backend/src/handlers/ocr/mod.rs b/backend/src/handlers/ocr/mod.rs index 79507b0..4b0bff2 100644 --- a/backend/src/handlers/ocr/mod.rs +++ b/backend/src/handlers/ocr/mod.rs @@ -47,6 +47,7 @@ fn get_page_count(file_path: &PathBuf) -> usize { } /// POST /api/sources/:id/parse - Parse a source document using Mistral OCR. +/// Returns immediately with "processing" status; OCR runs in background. pub async fn parse_source( State(state): State, Path(id): Path, @@ -72,22 +73,63 @@ pub async fn parse_source( ) })?; + // Check if already being processed + if source_entity.status == "processing" { + return Ok(Json(ParseResponse { + success: true, + biomarkers_count: 0, + message: "Already processing".to_string(), + })); + } + let file_path = PathBuf::from(&source_entity.file_path); - // 2. Upload file to Mistral - let file_id = mistral::upload_to_mistral(&state.mistral, &file_path).await.map_err(|e| { + // 2. Set status to "processing" immediately + let mut active_model: source::ActiveModel = source_entity.into(); + active_model.status = Set("processing".to_string()); + active_model.update(&state.db).await.map_err(|e| { ( StatusCode::INTERNAL_SERVER_ERROR, Json(ErrorResponse { - error: format!("Mistral upload failed: {}", e), + error: format!("Database update failed: {}", e), }), ) })?; - // 3. Get page count locally from PDF + // 3. Spawn background task for OCR processing + let db = state.db.clone(); + let mistral_config = state.mistral.clone(); + + tokio::spawn(async move { + if let Err(e) = process_ocr_background(db, mistral_config, id, file_path).await { + tracing::error!("Background OCR failed for source {}: {}", id, e); + } + }); + + // 4. Return immediately + Ok(Json(ParseResponse { + success: true, + biomarkers_count: 0, + message: "Processing started".to_string(), + })) +} + +/// Background OCR processing task +async fn process_ocr_background( + db: sea_orm::DatabaseConnection, + mistral_config: crate::config::MistralConfig, + source_id: i32, + file_path: PathBuf, +) -> Result<(), String> { + // Upload file to Mistral + let file_id = mistral::upload_to_mistral(&mistral_config, &file_path) + .await + .map_err(|e| format!("Mistral upload failed: {}", e))?; + + // Get page count locally from PDF let max_pages = get_page_count(&file_path); - let chunk_size = state.mistral.max_pages_per_request as usize; - let max_retries = state.mistral.max_retries; + let chunk_size = mistral_config.max_pages_per_request as usize; + let max_retries = mistral_config.max_retries; let mut all_results: Vec = Vec::new(); let mut failed_chunk: Option = None; @@ -101,7 +143,7 @@ pub async fn parse_source( let mut chunk_result = None; while attempts <= max_retries { - match mistral::ocr_pages(&state.mistral, &file_id, &pages).await { + match mistral::ocr_pages(&mistral_config, &file_id, &pages).await { Ok(annotation) => { chunk_result = Some(annotation); break; @@ -132,69 +174,59 @@ pub async fn parse_source( } } - // Fail if any chunk failed + // Handle failure if let Some(error_msg) = failed_chunk { - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("OCR parsing failed: {}", error_msg), - }), - )); + // Update status to failed + if let Ok(Some(entity)) = source::Entity::find_by_id(source_id).one(&db).await { + let mut active_model: source::ActiveModel = entity.into(); + active_model.status = Set("failed".to_string()); + let _ = active_model.update(&db).await; + } + return Err(format!("OCR parsing failed: {}", error_msg)); } if all_results.is_empty() { - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: "No OCR results obtained".to_string(), - }), - )); + // Update status to failed + if let Ok(Some(entity)) = source::Entity::find_by_id(source_id).one(&db).await { + let mut active_model: source::ActiveModel = entity.into(); + active_model.status = Set("failed".to_string()); + let _ = active_model.update(&db).await; + } + return Err("No OCR results obtained".to_string()); } - // 4. Get valid biomarker names from schema - let valid_biomarkers = schema::extract_valid_biomarker_names().map_err(|e| { - ( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to read schema: {}", e), - }), - ) - })?; + // Get valid biomarker names from schema + let valid_biomarkers = schema::extract_valid_biomarker_names() + .map_err(|e| format!("Failed to read schema: {}", e))?; tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len()); - // 5. Merge results with fuzzy matching + // Merge results with fuzzy matching let merged = matching::merge_results(all_results, &valid_biomarkers); - // 6. Save to database - let ocr_json = serde_json::to_string(&merged).map_err(|e| { - ( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("JSON serialization failed: {}", e), - }), - ) - })?; + // Save to database + let ocr_json = serde_json::to_string(&merged) + .map_err(|e| format!("JSON serialization failed: {}", e))?; + + let source_entity = source::Entity::find_by_id(source_id) + .one(&db) + .await + .map_err(|e| format!("Database error: {}", e))? + .ok_or_else(|| "Source not found".to_string())?; let mut active_model: source::ActiveModel = source_entity.into(); active_model.ocr_data = Set(Some(ocr_json)); + active_model.status = Set("parsed".to_string()); + active_model.biomarker_count = Set(Some(merged.biomarkers.len() as i32)); - active_model.update(&state.db).await.map_err(|e| { - ( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Database update failed: {}", e), - }), - ) - })?; + active_model.update(&db).await + .map_err(|e| format!("Database update failed: {}", e))?; - Ok(Json(ParseResponse { - success: true, - biomarkers_count: merged.biomarkers.len(), - message: format!( - "Successfully parsed {} biomarkers for {}", - merged.biomarkers.len(), - merged.patient_name.unwrap_or_else(|| "Unknown".to_string()) - ), - })) + tracing::info!( + "Successfully parsed {} biomarkers for source {}", + merged.biomarkers.len(), + source_id + ); + + Ok(()) } diff --git a/backend/src/handlers/sources.rs b/backend/src/handlers/sources.rs index 92292bf..1f99405 100644 --- a/backend/src/handlers/sources.rs +++ b/backend/src/handlers/sources.rs @@ -24,6 +24,8 @@ pub struct SourceResponse { pub file_path: String, pub file_type: String, pub file_size: i64, + pub status: String, + pub biomarker_count: Option, pub ocr_data: Option, pub description: Option, pub uploaded_at: String, @@ -55,6 +57,8 @@ pub async fn list_sources( file_path: s.file_path, file_type: s.file_type, file_size: s.file_size, + status: s.status, + biomarker_count: s.biomarker_count, ocr_data: s.ocr_data, description: s.description, uploaded_at: s.uploaded_at.to_string(), @@ -82,6 +86,8 @@ pub async fn get_source( file_path: s.file_path, file_type: s.file_type, file_size: s.file_size, + status: s.status, + biomarker_count: s.biomarker_count, ocr_data: s.ocr_data, description: s.description, uploaded_at: s.uploaded_at.to_string(), @@ -166,6 +172,8 @@ pub async fn upload_source( file_path: Set(file_path.to_string_lossy().to_string()), file_type: Set(content_type.clone()), file_size: Set(file_size), + status: Set("pending".to_string()), + biomarker_count: Set(None), ocr_data: Set(None), description: Set(description.clone()), uploaded_at: Set(now), @@ -187,6 +195,8 @@ pub async fn upload_source( file_path: inserted.file_path, file_type: inserted.file_type, file_size: inserted.file_size, + status: inserted.status, + biomarker_count: inserted.biomarker_count, ocr_data: inserted.ocr_data, description: inserted.description, uploaded_at: inserted.uploaded_at.to_string(), @@ -256,6 +266,8 @@ pub async fn update_ocr( file_path: updated.file_path, file_type: updated.file_type, file_size: updated.file_size, + status: updated.status, + biomarker_count: updated.biomarker_count, ocr_data: updated.ocr_data, description: updated.description, uploaded_at: updated.uploaded_at.to_string(), diff --git a/backend/src/models/bio/source.rs b/backend/src/models/bio/source.rs index 9c13520..e6b609f 100644 --- a/backend/src/models/bio/source.rs +++ b/backend/src/models/bio/source.rs @@ -27,6 +27,14 @@ pub struct Model { /// File size in bytes pub file_size: i64, + /// Parsing status: "pending", "processing", "parsed", "failed" + #[sea_orm(column_type = "Text")] + pub status: String, + + /// Number of biomarkers extracted (populated after parsing) + #[sea_orm(nullable)] + pub biomarker_count: Option, + /// OCR parsed data as JSON #[sea_orm(column_type = "Text", nullable)] pub ocr_data: Option, diff --git a/frontend/src/pages/Sources.tsx b/frontend/src/pages/Sources.tsx index e843196..f1b9189 100644 --- a/frontend/src/pages/Sources.tsx +++ b/frontend/src/pages/Sources.tsx @@ -7,6 +7,8 @@ interface Source { file_path: string file_type: string file_size: number + status: string + biomarker_count: number | null ocr_data: string | null description: string | null uploaded_at: string @@ -217,17 +219,30 @@ export function SourcesPage() {
- {source.ocr_data ? ( + {source.status === 'parsed' ? ( - Parsed Parsed + Parsed + {source.biomarker_count ? `${source.biomarker_count} biomarkers` : 'Parsed'} + ) : source.status === 'processing' ? ( + + Processing... + + ) : source.status === 'failed' ? ( + ) : ( )}