feat: implement OCR parsing status tracking and biomarker count display
This commit is contained in:
@@ -47,6 +47,7 @@ fn get_page_count(file_path: &PathBuf) -> usize {
|
||||
}
|
||||
|
||||
/// POST /api/sources/:id/parse - Parse a source document using Mistral OCR.
|
||||
/// Returns immediately with "processing" status; OCR runs in background.
|
||||
pub async fn parse_source(
|
||||
State(state): State<OcrState>,
|
||||
Path(id): Path<i32>,
|
||||
@@ -72,22 +73,63 @@ pub async fn parse_source(
|
||||
)
|
||||
})?;
|
||||
|
||||
// Check if already being processed
|
||||
if source_entity.status == "processing" {
|
||||
return Ok(Json(ParseResponse {
|
||||
success: true,
|
||||
biomarkers_count: 0,
|
||||
message: "Already processing".to_string(),
|
||||
}));
|
||||
}
|
||||
|
||||
let file_path = PathBuf::from(&source_entity.file_path);
|
||||
|
||||
// 2. Upload file to Mistral
|
||||
let file_id = mistral::upload_to_mistral(&state.mistral, &file_path).await.map_err(|e| {
|
||||
// 2. Set status to "processing" immediately
|
||||
let mut active_model: source::ActiveModel = source_entity.into();
|
||||
active_model.status = Set("processing".to_string());
|
||||
active_model.update(&state.db).await.map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("Mistral upload failed: {}", e),
|
||||
error: format!("Database update failed: {}", e),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
|
||||
// 3. Get page count locally from PDF
|
||||
// 3. Spawn background task for OCR processing
|
||||
let db = state.db.clone();
|
||||
let mistral_config = state.mistral.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = process_ocr_background(db, mistral_config, id, file_path).await {
|
||||
tracing::error!("Background OCR failed for source {}: {}", id, e);
|
||||
}
|
||||
});
|
||||
|
||||
// 4. Return immediately
|
||||
Ok(Json(ParseResponse {
|
||||
success: true,
|
||||
biomarkers_count: 0,
|
||||
message: "Processing started".to_string(),
|
||||
}))
|
||||
}
|
||||
|
||||
/// Background OCR processing task
|
||||
async fn process_ocr_background(
|
||||
db: sea_orm::DatabaseConnection,
|
||||
mistral_config: crate::config::MistralConfig,
|
||||
source_id: i32,
|
||||
file_path: PathBuf,
|
||||
) -> Result<(), String> {
|
||||
// Upload file to Mistral
|
||||
let file_id = mistral::upload_to_mistral(&mistral_config, &file_path)
|
||||
.await
|
||||
.map_err(|e| format!("Mistral upload failed: {}", e))?;
|
||||
|
||||
// Get page count locally from PDF
|
||||
let max_pages = get_page_count(&file_path);
|
||||
let chunk_size = state.mistral.max_pages_per_request as usize;
|
||||
let max_retries = state.mistral.max_retries;
|
||||
let chunk_size = mistral_config.max_pages_per_request as usize;
|
||||
let max_retries = mistral_config.max_retries;
|
||||
let mut all_results: Vec<types::DocumentAnnotation> = Vec::new();
|
||||
let mut failed_chunk: Option<String> = None;
|
||||
|
||||
@@ -101,7 +143,7 @@ pub async fn parse_source(
|
||||
let mut chunk_result = None;
|
||||
|
||||
while attempts <= max_retries {
|
||||
match mistral::ocr_pages(&state.mistral, &file_id, &pages).await {
|
||||
match mistral::ocr_pages(&mistral_config, &file_id, &pages).await {
|
||||
Ok(annotation) => {
|
||||
chunk_result = Some(annotation);
|
||||
break;
|
||||
@@ -132,69 +174,59 @@ pub async fn parse_source(
|
||||
}
|
||||
}
|
||||
|
||||
// Fail if any chunk failed
|
||||
// Handle failure
|
||||
if let Some(error_msg) = failed_chunk {
|
||||
return Err((
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("OCR parsing failed: {}", error_msg),
|
||||
}),
|
||||
));
|
||||
// Update status to failed
|
||||
if let Ok(Some(entity)) = source::Entity::find_by_id(source_id).one(&db).await {
|
||||
let mut active_model: source::ActiveModel = entity.into();
|
||||
active_model.status = Set("failed".to_string());
|
||||
let _ = active_model.update(&db).await;
|
||||
}
|
||||
return Err(format!("OCR parsing failed: {}", error_msg));
|
||||
}
|
||||
|
||||
if all_results.is_empty() {
|
||||
return Err((
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: "No OCR results obtained".to_string(),
|
||||
}),
|
||||
));
|
||||
// Update status to failed
|
||||
if let Ok(Some(entity)) = source::Entity::find_by_id(source_id).one(&db).await {
|
||||
let mut active_model: source::ActiveModel = entity.into();
|
||||
active_model.status = Set("failed".to_string());
|
||||
let _ = active_model.update(&db).await;
|
||||
}
|
||||
return Err("No OCR results obtained".to_string());
|
||||
}
|
||||
|
||||
// 4. Get valid biomarker names from schema
|
||||
let valid_biomarkers = schema::extract_valid_biomarker_names().map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("Failed to read schema: {}", e),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
// Get valid biomarker names from schema
|
||||
let valid_biomarkers = schema::extract_valid_biomarker_names()
|
||||
.map_err(|e| format!("Failed to read schema: {}", e))?;
|
||||
|
||||
tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len());
|
||||
|
||||
// 5. Merge results with fuzzy matching
|
||||
// Merge results with fuzzy matching
|
||||
let merged = matching::merge_results(all_results, &valid_biomarkers);
|
||||
|
||||
// 6. Save to database
|
||||
let ocr_json = serde_json::to_string(&merged).map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("JSON serialization failed: {}", e),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
// Save to database
|
||||
let ocr_json = serde_json::to_string(&merged)
|
||||
.map_err(|e| format!("JSON serialization failed: {}", e))?;
|
||||
|
||||
let source_entity = source::Entity::find_by_id(source_id)
|
||||
.one(&db)
|
||||
.await
|
||||
.map_err(|e| format!("Database error: {}", e))?
|
||||
.ok_or_else(|| "Source not found".to_string())?;
|
||||
|
||||
let mut active_model: source::ActiveModel = source_entity.into();
|
||||
active_model.ocr_data = Set(Some(ocr_json));
|
||||
active_model.status = Set("parsed".to_string());
|
||||
active_model.biomarker_count = Set(Some(merged.biomarkers.len() as i32));
|
||||
|
||||
active_model.update(&state.db).await.map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("Database update failed: {}", e),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
active_model.update(&db).await
|
||||
.map_err(|e| format!("Database update failed: {}", e))?;
|
||||
|
||||
Ok(Json(ParseResponse {
|
||||
success: true,
|
||||
biomarkers_count: merged.biomarkers.len(),
|
||||
message: format!(
|
||||
"Successfully parsed {} biomarkers for {}",
|
||||
merged.biomarkers.len(),
|
||||
merged.patient_name.unwrap_or_else(|| "Unknown".to_string())
|
||||
),
|
||||
}))
|
||||
tracing::info!(
|
||||
"Successfully parsed {} biomarkers for source {}",
|
||||
merged.biomarkers.len(),
|
||||
source_id
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -24,6 +24,8 @@ pub struct SourceResponse {
|
||||
pub file_path: String,
|
||||
pub file_type: String,
|
||||
pub file_size: i64,
|
||||
pub status: String,
|
||||
pub biomarker_count: Option<i32>,
|
||||
pub ocr_data: Option<String>,
|
||||
pub description: Option<String>,
|
||||
pub uploaded_at: String,
|
||||
@@ -55,6 +57,8 @@ pub async fn list_sources(
|
||||
file_path: s.file_path,
|
||||
file_type: s.file_type,
|
||||
file_size: s.file_size,
|
||||
status: s.status,
|
||||
biomarker_count: s.biomarker_count,
|
||||
ocr_data: s.ocr_data,
|
||||
description: s.description,
|
||||
uploaded_at: s.uploaded_at.to_string(),
|
||||
@@ -82,6 +86,8 @@ pub async fn get_source(
|
||||
file_path: s.file_path,
|
||||
file_type: s.file_type,
|
||||
file_size: s.file_size,
|
||||
status: s.status,
|
||||
biomarker_count: s.biomarker_count,
|
||||
ocr_data: s.ocr_data,
|
||||
description: s.description,
|
||||
uploaded_at: s.uploaded_at.to_string(),
|
||||
@@ -166,6 +172,8 @@ pub async fn upload_source(
|
||||
file_path: Set(file_path.to_string_lossy().to_string()),
|
||||
file_type: Set(content_type.clone()),
|
||||
file_size: Set(file_size),
|
||||
status: Set("pending".to_string()),
|
||||
biomarker_count: Set(None),
|
||||
ocr_data: Set(None),
|
||||
description: Set(description.clone()),
|
||||
uploaded_at: Set(now),
|
||||
@@ -187,6 +195,8 @@ pub async fn upload_source(
|
||||
file_path: inserted.file_path,
|
||||
file_type: inserted.file_type,
|
||||
file_size: inserted.file_size,
|
||||
status: inserted.status,
|
||||
biomarker_count: inserted.biomarker_count,
|
||||
ocr_data: inserted.ocr_data,
|
||||
description: inserted.description,
|
||||
uploaded_at: inserted.uploaded_at.to_string(),
|
||||
@@ -256,6 +266,8 @@ pub async fn update_ocr(
|
||||
file_path: updated.file_path,
|
||||
file_type: updated.file_type,
|
||||
file_size: updated.file_size,
|
||||
status: updated.status,
|
||||
biomarker_count: updated.biomarker_count,
|
||||
ocr_data: updated.ocr_data,
|
||||
description: updated.description,
|
||||
uploaded_at: updated.uploaded_at.to_string(),
|
||||
|
||||
@@ -27,6 +27,14 @@ pub struct Model {
|
||||
/// File size in bytes
|
||||
pub file_size: i64,
|
||||
|
||||
/// Parsing status: "pending", "processing", "parsed", "failed"
|
||||
#[sea_orm(column_type = "Text")]
|
||||
pub status: String,
|
||||
|
||||
/// Number of biomarkers extracted (populated after parsing)
|
||||
#[sea_orm(nullable)]
|
||||
pub biomarker_count: Option<i32>,
|
||||
|
||||
/// OCR parsed data as JSON
|
||||
#[sea_orm(column_type = "Text", nullable)]
|
||||
pub ocr_data: Option<String>,
|
||||
|
||||
Reference in New Issue
Block a user