feat: implement OCR parsing status tracking and biomarker count display

This commit is contained in:
2025-12-21 13:33:39 +05:30
parent fc6376abec
commit 0f277d6b3d
4 changed files with 126 additions and 59 deletions

View File

@@ -47,6 +47,7 @@ fn get_page_count(file_path: &PathBuf) -> usize {
} }
/// POST /api/sources/:id/parse - Parse a source document using Mistral OCR. /// POST /api/sources/:id/parse - Parse a source document using Mistral OCR.
/// Returns immediately with "processing" status; OCR runs in background.
pub async fn parse_source( pub async fn parse_source(
State(state): State<OcrState>, State(state): State<OcrState>,
Path(id): Path<i32>, Path(id): Path<i32>,
@@ -72,22 +73,63 @@ pub async fn parse_source(
) )
})?; })?;
// Check if already being processed
if source_entity.status == "processing" {
return Ok(Json(ParseResponse {
success: true,
biomarkers_count: 0,
message: "Already processing".to_string(),
}));
}
let file_path = PathBuf::from(&source_entity.file_path); let file_path = PathBuf::from(&source_entity.file_path);
// 2. Upload file to Mistral // 2. Set status to "processing" immediately
let file_id = mistral::upload_to_mistral(&state.mistral, &file_path).await.map_err(|e| { let mut active_model: source::ActiveModel = source_entity.into();
active_model.status = Set("processing".to_string());
active_model.update(&state.db).await.map_err(|e| {
( (
StatusCode::INTERNAL_SERVER_ERROR, StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse { Json(ErrorResponse {
error: format!("Mistral upload failed: {}", e), error: format!("Database update failed: {}", e),
}), }),
) )
})?; })?;
// 3. Get page count locally from PDF // 3. Spawn background task for OCR processing
let db = state.db.clone();
let mistral_config = state.mistral.clone();
tokio::spawn(async move {
if let Err(e) = process_ocr_background(db, mistral_config, id, file_path).await {
tracing::error!("Background OCR failed for source {}: {}", id, e);
}
});
// 4. Return immediately
Ok(Json(ParseResponse {
success: true,
biomarkers_count: 0,
message: "Processing started".to_string(),
}))
}
/// Background OCR processing task
async fn process_ocr_background(
db: sea_orm::DatabaseConnection,
mistral_config: crate::config::MistralConfig,
source_id: i32,
file_path: PathBuf,
) -> Result<(), String> {
// Upload file to Mistral
let file_id = mistral::upload_to_mistral(&mistral_config, &file_path)
.await
.map_err(|e| format!("Mistral upload failed: {}", e))?;
// Get page count locally from PDF
let max_pages = get_page_count(&file_path); let max_pages = get_page_count(&file_path);
let chunk_size = state.mistral.max_pages_per_request as usize; let chunk_size = mistral_config.max_pages_per_request as usize;
let max_retries = state.mistral.max_retries; let max_retries = mistral_config.max_retries;
let mut all_results: Vec<types::DocumentAnnotation> = Vec::new(); let mut all_results: Vec<types::DocumentAnnotation> = Vec::new();
let mut failed_chunk: Option<String> = None; let mut failed_chunk: Option<String> = None;
@@ -101,7 +143,7 @@ pub async fn parse_source(
let mut chunk_result = None; let mut chunk_result = None;
while attempts <= max_retries { while attempts <= max_retries {
match mistral::ocr_pages(&state.mistral, &file_id, &pages).await { match mistral::ocr_pages(&mistral_config, &file_id, &pages).await {
Ok(annotation) => { Ok(annotation) => {
chunk_result = Some(annotation); chunk_result = Some(annotation);
break; break;
@@ -132,69 +174,59 @@ pub async fn parse_source(
} }
} }
// Fail if any chunk failed // Handle failure
if let Some(error_msg) = failed_chunk { if let Some(error_msg) = failed_chunk {
return Err(( // Update status to failed
StatusCode::INTERNAL_SERVER_ERROR, if let Ok(Some(entity)) = source::Entity::find_by_id(source_id).one(&db).await {
Json(ErrorResponse { let mut active_model: source::ActiveModel = entity.into();
error: format!("OCR parsing failed: {}", error_msg), active_model.status = Set("failed".to_string());
}), let _ = active_model.update(&db).await;
)); }
return Err(format!("OCR parsing failed: {}", error_msg));
} }
if all_results.is_empty() { if all_results.is_empty() {
return Err(( // Update status to failed
StatusCode::INTERNAL_SERVER_ERROR, if let Ok(Some(entity)) = source::Entity::find_by_id(source_id).one(&db).await {
Json(ErrorResponse { let mut active_model: source::ActiveModel = entity.into();
error: "No OCR results obtained".to_string(), active_model.status = Set("failed".to_string());
}), let _ = active_model.update(&db).await;
)); }
return Err("No OCR results obtained".to_string());
} }
// 4. Get valid biomarker names from schema // Get valid biomarker names from schema
let valid_biomarkers = schema::extract_valid_biomarker_names().map_err(|e| { let valid_biomarkers = schema::extract_valid_biomarker_names()
( .map_err(|e| format!("Failed to read schema: {}", e))?;
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Failed to read schema: {}", e),
}),
)
})?;
tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len()); tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len());
// 5. Merge results with fuzzy matching // Merge results with fuzzy matching
let merged = matching::merge_results(all_results, &valid_biomarkers); let merged = matching::merge_results(all_results, &valid_biomarkers);
// 6. Save to database // Save to database
let ocr_json = serde_json::to_string(&merged).map_err(|e| { let ocr_json = serde_json::to_string(&merged)
( .map_err(|e| format!("JSON serialization failed: {}", e))?;
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse { let source_entity = source::Entity::find_by_id(source_id)
error: format!("JSON serialization failed: {}", e), .one(&db)
}), .await
) .map_err(|e| format!("Database error: {}", e))?
})?; .ok_or_else(|| "Source not found".to_string())?;
let mut active_model: source::ActiveModel = source_entity.into(); let mut active_model: source::ActiveModel = source_entity.into();
active_model.ocr_data = Set(Some(ocr_json)); active_model.ocr_data = Set(Some(ocr_json));
active_model.status = Set("parsed".to_string());
active_model.biomarker_count = Set(Some(merged.biomarkers.len() as i32));
active_model.update(&state.db).await.map_err(|e| { active_model.update(&db).await
( .map_err(|e| format!("Database update failed: {}", e))?;
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Database update failed: {}", e),
}),
)
})?;
Ok(Json(ParseResponse { tracing::info!(
success: true, "Successfully parsed {} biomarkers for source {}",
biomarkers_count: merged.biomarkers.len(),
message: format!(
"Successfully parsed {} biomarkers for {}",
merged.biomarkers.len(), merged.biomarkers.len(),
merged.patient_name.unwrap_or_else(|| "Unknown".to_string()) source_id
), );
}))
Ok(())
} }

View File

@@ -24,6 +24,8 @@ pub struct SourceResponse {
pub file_path: String, pub file_path: String,
pub file_type: String, pub file_type: String,
pub file_size: i64, pub file_size: i64,
pub status: String,
pub biomarker_count: Option<i32>,
pub ocr_data: Option<String>, pub ocr_data: Option<String>,
pub description: Option<String>, pub description: Option<String>,
pub uploaded_at: String, pub uploaded_at: String,
@@ -55,6 +57,8 @@ pub async fn list_sources(
file_path: s.file_path, file_path: s.file_path,
file_type: s.file_type, file_type: s.file_type,
file_size: s.file_size, file_size: s.file_size,
status: s.status,
biomarker_count: s.biomarker_count,
ocr_data: s.ocr_data, ocr_data: s.ocr_data,
description: s.description, description: s.description,
uploaded_at: s.uploaded_at.to_string(), uploaded_at: s.uploaded_at.to_string(),
@@ -82,6 +86,8 @@ pub async fn get_source(
file_path: s.file_path, file_path: s.file_path,
file_type: s.file_type, file_type: s.file_type,
file_size: s.file_size, file_size: s.file_size,
status: s.status,
biomarker_count: s.biomarker_count,
ocr_data: s.ocr_data, ocr_data: s.ocr_data,
description: s.description, description: s.description,
uploaded_at: s.uploaded_at.to_string(), uploaded_at: s.uploaded_at.to_string(),
@@ -166,6 +172,8 @@ pub async fn upload_source(
file_path: Set(file_path.to_string_lossy().to_string()), file_path: Set(file_path.to_string_lossy().to_string()),
file_type: Set(content_type.clone()), file_type: Set(content_type.clone()),
file_size: Set(file_size), file_size: Set(file_size),
status: Set("pending".to_string()),
biomarker_count: Set(None),
ocr_data: Set(None), ocr_data: Set(None),
description: Set(description.clone()), description: Set(description.clone()),
uploaded_at: Set(now), uploaded_at: Set(now),
@@ -187,6 +195,8 @@ pub async fn upload_source(
file_path: inserted.file_path, file_path: inserted.file_path,
file_type: inserted.file_type, file_type: inserted.file_type,
file_size: inserted.file_size, file_size: inserted.file_size,
status: inserted.status,
biomarker_count: inserted.biomarker_count,
ocr_data: inserted.ocr_data, ocr_data: inserted.ocr_data,
description: inserted.description, description: inserted.description,
uploaded_at: inserted.uploaded_at.to_string(), uploaded_at: inserted.uploaded_at.to_string(),
@@ -256,6 +266,8 @@ pub async fn update_ocr(
file_path: updated.file_path, file_path: updated.file_path,
file_type: updated.file_type, file_type: updated.file_type,
file_size: updated.file_size, file_size: updated.file_size,
status: updated.status,
biomarker_count: updated.biomarker_count,
ocr_data: updated.ocr_data, ocr_data: updated.ocr_data,
description: updated.description, description: updated.description,
uploaded_at: updated.uploaded_at.to_string(), uploaded_at: updated.uploaded_at.to_string(),

View File

@@ -27,6 +27,14 @@ pub struct Model {
/// File size in bytes /// File size in bytes
pub file_size: i64, pub file_size: i64,
/// Parsing status: "pending", "processing", "parsed", "failed"
#[sea_orm(column_type = "Text")]
pub status: String,
/// Number of biomarkers extracted (populated after parsing)
#[sea_orm(nullable)]
pub biomarker_count: Option<i32>,
/// OCR parsed data as JSON /// OCR parsed data as JSON
#[sea_orm(column_type = "Text", nullable)] #[sea_orm(column_type = "Text", nullable)]
pub ocr_data: Option<String>, pub ocr_data: Option<String>,

View File

@@ -7,6 +7,8 @@ interface Source {
file_path: string file_path: string
file_type: string file_type: string
file_size: number file_size: number
status: string
biomarker_count: number | null
ocr_data: string | null ocr_data: string | null
description: string | null description: string | null
uploaded_at: string uploaded_at: string
@@ -217,17 +219,30 @@ export function SourcesPage() {
</div> </div>
</div> </div>
<div className="flex gap-sm items-center"> <div className="flex gap-sm items-center">
{source.ocr_data ? ( {source.status === 'parsed' ? (
<span className="status-parsed flex items-center gap-xs text-xs"> <span className="status-parsed flex items-center gap-xs text-xs">
<img src="/icons/general/icons8-checkmark-50.png" alt="Parsed" className="icon-sm" /> Parsed <img src="/icons/general/icons8-checkmark-50.png" alt="Parsed" className="icon-sm" />
{source.biomarker_count ? `${source.biomarker_count} biomarkers` : 'Parsed'}
</span> </span>
) : source.status === 'processing' ? (
<span className="status-processing text-xs text-secondary">
Processing...
</span>
) : source.status === 'failed' ? (
<button
className="btn btn-primary btn-sm"
onClick={() => handleParse(source.id)}
disabled={parsingId === source.id}
>
Retry
</button>
) : ( ) : (
<button <button
className="btn btn-primary btn-sm" className="btn btn-primary btn-sm"
onClick={() => handleParse(source.id)} onClick={() => handleParse(source.id)}
disabled={parsingId === source.id} disabled={parsingId === source.id}
> >
{parsingId === source.id ? 'Parsing...' : 'Parse'} Parse
</button> </button>
)} )}
<button <button