From fc6376abec0d913db27109fd1fcb2cf5d72f4e8d Mon Sep 17 00:00:00 2001 From: abhishekbhakat Date: Sun, 21 Dec 2025 11:06:57 +0530 Subject: [PATCH] feat: implement Mistral OCR document parsing with fuzzy matching and frontend integration --- backend/Cargo.toml | 6 + backend/ocr_schema.json | 245 +++++++++++++++++++++++++++ backend/sample.config.yaml | 10 ++ backend/seed_biomarkers.yaml | 30 ++-- backend/src/config.rs | 13 +- backend/src/handlers/mod.rs | 1 + backend/src/handlers/ocr/matching.rs | 180 ++++++++++++++++++++ backend/src/handlers/ocr/mistral.rs | 211 +++++++++++++++++++++++ backend/src/handlers/ocr/mod.rs | 200 ++++++++++++++++++++++ backend/src/handlers/ocr/schema.rs | 49 ++++++ backend/src/handlers/ocr/types.rs | 77 +++++++++ backend/src/main.rs | 25 ++- frontend/src/pages/Sources.tsx | 34 +++- 13 files changed, 1062 insertions(+), 19 deletions(-) create mode 100644 backend/ocr_schema.json create mode 100644 backend/src/handlers/ocr/matching.rs create mode 100644 backend/src/handlers/ocr/mistral.rs create mode 100644 backend/src/handlers/ocr/mod.rs create mode 100644 backend/src/handlers/ocr/schema.rs create mode 100644 backend/src/handlers/ocr/types.rs diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 6f2ea85..82c9116 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -47,3 +47,9 @@ regex = "1" # CLI argh = "0.1" +reqwest = { version = "0.12.26", features = ["multipart", "json"] } +serde_json = "1.0.145" + +# PDF parsing for page count +lopdf = "0.36" +strsim = "0.11" diff --git a/backend/ocr_schema.json b/backend/ocr_schema.json new file mode 100644 index 0000000..f74dabf --- /dev/null +++ b/backend/ocr_schema.json @@ -0,0 +1,245 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "name": "LabReport", + "description": "Extract biomarker data from a medical lab report", + "type": "object", + "properties": { + "patient_name": { + "type": "string", + "description": "Full name of the patient" + }, + "patient_age": { + 
"type": "integer", + "description": "Age of the patient in years" + }, + "patient_gender": { + "type": "string", + "enum": [ + "male", + "female", + "other" + ], + "description": "Gender of the patient" + }, + "lab_name": { + "type": "string", + "description": "Name of the laboratory" + }, + "test_date": { + "type": "string", + "description": "Date when the sample was collected (YYYY-MM-DD format if possible)" + }, + "report_id": { + "type": "string", + "description": "Report ID, barcode, or reference number" + }, + "biomarkers": { + "type": "array", + "description": "List of biomarker test results", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Name of the biomarker/test", + "enum": [ + "ARSENIC", + "CADMIUM", + "MERCURY", + "LEAD", + "CHROMIUM", + "BARIUM", + "COBALT", + "CAESIUM", + "THALLIUM", + "URANIUM", + "STRONTIUM", + "ANTIMONY", + "TIN", + "MOLYBDENUM", + "SILVER", + "VANADIUM", + "BERYLLIUM", + "BISMUTH", + "SELENIUM", + "ALUMINIUM", + "NICKEL", + "MANGANESE", + "GLYCOSYLATED HEMOGLOBIN (HbA1c)", + "AVERAGE BLOOD GLUCOSE (ABG)", + "FASTING BLOOD SUGAR (GLUCOSE)", + "INSULIN FASTING", + "FRUCTOSAMINE", + "BLOOD KETONE (D3HB)", + "ERYTHROCYTE SEDIMENTATION RATE (ESR)", + "ANTI CCP (ACCP)", + "ANTI NUCLEAR ANTIBODIES (ANA)", + "HEMOGLOBIN", + "HEMATOCRIT (PCV)", + "TOTAL RED BLOOD CELL COUNT (RBC)", + "MEAN CORPUSCULAR VOLUME (MCV)", + "MEAN CORPUSCULAR HEMOGLOBIN (MCH)", + "MEAN CORP. HEMO. 
CONC (MCHC)", + "RED CELL DISTRIBUTION WIDTH - SD (RDW-SD)", + "RED CELL DISTRIBUTION WIDTH (RDW-CV)", + "TOTAL LEUCOCYTE COUNT (WBC)", + "NEUTROPHILS PERCENTAGE", + "LYMPHOCYTES PERCENTAGE", + "MONOCYTES PERCENTAGE", + "EOSINOPHILS PERCENTAGE", + "BASOPHILS PERCENTAGE", + "IMMATURE GRANULOCYTE PERCENTAGE (IG%)", + "NUCLEATED RED BLOOD CELLS %", + "NEUTROPHILS ABSOLUTE COUNT", + "LYMPHOCYTES ABSOLUTE COUNT", + "MONOCYTES - ABSOLUTE COUNT", + "BASOPHILS ABSOLUTE COUNT", + "EOSINOPHILS ABSOLUTE COUNT", + "IMMATURE GRANULOCYTES (IG)", + "NUCLEATED RED BLOOD CELLS", + "PLATELET COUNT", + "MEAN PLATELET VOLUME (MPV)", + "PLATELET DISTRIBUTION WIDTH (PDW)", + "PLATELET TO LARGE CELL RATIO (PLCR)", + "PLATELETCRIT (PCT)", + "VITAMIN A", + "VITAMIN E", + "VITAMIN K", + "VITAMIN B1 (THIAMIN)", + "VITAMIN B2 (RIBOFLAVIN)", + "VITAMIN B3 (NIACIN/NICOTINIC ACID)", + "VITAMIN B5 (PANTOTHENIC ACID)", + "VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)", + "VITAMIN B7 (BIOTIN)", + "VITAMIN B9 (FOLIC ACID)", + "VITAMIN B12 (COBALAMIN)", + "VITAMIN D TOTAL", + "VITAMIN D2", + "VITAMIN D3", + "CORTISOL", + "CORTICOSTERONE", + "ANDROSTENEDIONE", + "ESTRADIOL", + "TESTOSTERONE", + "PROGESTERONE", + "17-HYDROXYPROGESTERONE", + "DEHYDROEPIANDROSTERONE (DHEA)", + "DHEA - SULPHATE (DHEAS)", + "DEOXYCORTISOL", + "ALPHA-1-ANTITRYPSIN (AAT)", + "HOMOCYSTEINE", + "TROPONIN I", + "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)", + "LIPOPROTEIN (A) [Lp(a)]", + "LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)", + "CYSTATIN C", + "BLOOD UREA NITROGEN (BUN)", + "UREA (CALCULATED)", + "CREATININE - SERUM", + "UREA / SR.CREATININE RATIO", + "BUN / SR.CREATININE RATIO", + "CALCIUM", + "URIC ACID", + "ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)", + "TOTAL CHOLESTEROL", + "HDL CHOLESTEROL - DIRECT", + "LDL CHOLESTEROL - DIRECT", + "TRIGLYCERIDES", + "VLDL CHOLESTEROL", + "NON-HDL CHOLESTEROL", + "TC / HDL CHOLESTEROL RATIO", + "LDL / HDL RATIO", + "HDL / LDL RATIO", + "TRIG / HDL RATIO", + "APOLIPOPROTEIN - A1 
(APO-A1)", + "APOLIPOPROTEIN - B (APO-B)", + "APO B / APO A1 RATIO", + "IRON", + "TOTAL IRON BINDING CAPACITY (TIBC)", + "% TRANSFERRIN SATURATION", + "FERRITIN", + "UNSAT. IRON-BINDING CAPACITY (UIBC)", + "ALKALINE PHOSPHATASE", + "BILIRUBIN - TOTAL", + "BILIRUBIN - DIRECT", + "BILIRUBIN (INDIRECT)", + "GAMMA GLUTAMYL TRANSFERASE (GGT)", + "ASPARTATE AMINOTRANSFERASE (SGOT)", + "ALANINE TRANSAMINASE (SGPT)", + "SGOT / SGPT RATIO", + "PROTEIN - TOTAL", + "ALBUMIN - SERUM", + "SERUM GLOBULIN", + "SERUM ALB/GLOBULIN RATIO", + "SODIUM", + "POTASSIUM", + "CHLORIDE", + "MAGNESIUM", + "TOTAL TRIIODOTHYRONINE (T3)", + "TOTAL THYROXINE (T4)", + "TSH ULTRASENSITIVE", + "SERUM COPPER", + "SERUM ZINC", + "AMYLASE", + "LIPASE", + "URINARY MICROALBUMIN", + "CREATININE - URINE", + "URI. ALBUMIN/CREATININE RATIO", + "URINE COLOUR", + "URINE APPEARANCE", + "URINE SPECIFIC GRAVITY", + "URINE PH", + "URINARY PROTEIN", + "URINARY GLUCOSE", + "URINE KETONE", + "URINARY BILIRUBIN", + "UROBILINOGEN", + "BILE SALT", + "BILE PIGMENT", + "URINE BLOOD", + "NITRITE", + "LEUCOCYTE ESTERASE", + "MUCUS", + "URINE RBC", + "URINARY LEUCOCYTES (PUS CELLS)", + "EPITHELIAL CELLS", + "CASTS", + "CRYSTALS", + "BACTERIA", + "YEAST", + "PARASITE", + "WEIGHT", + "HEIGHT", + "BODY MASS INDEX (BMI)", + "HEART RATE", + "BLOOD PRESSURE SYSTOLIC", + "BLOOD PRESSURE DIASTOLIC", + "OXYGEN SATURATION (SpO2)", + "BODY TEMPERATURE", + "STEPS", + "CALORIES BURNED" + ] + }, + "value": { + "type": "number", + "description": "Observed/measured value" + }, + "value_string": { + "type": "string", + "description": "Value as string if non-numeric (e.g., 'Negative', 'Trace', '> 65')" + }, + "unit": { + "type": "string", + "description": "Unit of measurement" + } + }, + "required": [ + "name" + ] + } + } + }, + "required": [ + "biomarkers" + ] +} \ No newline at end of file diff --git a/backend/sample.config.yaml b/backend/sample.config.yaml index 22344bc..9f213ac 100644 --- a/backend/sample.config.yaml +++ 
b/backend/sample.config.yaml @@ -10,6 +10,7 @@ paths: database: "./data/zhealth.db" logs: "./logs" uploads: "./data/uploads" + max_upload_mb: 50 # Maximum file upload size in MB logging: level: "info" # Options: trace | debug | info | warn | error @@ -29,3 +30,12 @@ ai: provider: "gemini" # Options: gemini | openai | anthropic model: "gemini-3-flash-preview" api_key: "${AI_API_KEY}" + +# Mistral OCR for document parsing +mistral: + api_key: "${MISTRAL_API_KEY}" + ocr_model: "mistral-ocr-latest" + max_pages_per_request: 8 + max_retries: 2 # Max retry attempts per chunk + timeout_secs: 120 # Request timeout in seconds + diff --git a/backend/seed_biomarkers.yaml b/backend/seed_biomarkers.yaml index 95c10f3..ae1eee3 100644 --- a/backend/seed_biomarkers.yaml +++ b/backend/seed_biomarkers.yaml @@ -214,7 +214,7 @@ biomarkers: # ============================================================================ # DIABETES / METABOLIC - Scale-based interpretations # ============================================================================ - - name: "HbA1c" + - name: "GLYCOSYLATED HEMOGLOBIN (HbA1c)" test_category: DIABETES category: metabolic unit: "%" @@ -339,7 +339,7 @@ biomarkers: min: 36.0 max: 44.0 - - name: "TOTAL RBC" + - name: "TOTAL RED BLOOD CELL COUNT (RBC)" test_category: HEMOGRAM category: blood unit: "10^6/µL" @@ -614,7 +614,7 @@ biomarkers: min: 0.13 max: 1.19 - - name: "VITAMIN B1/THIAMIN" + - name: "VITAMIN B1 (THIAMIN)" test_category: VITAMIN category: vitamins unit: "ng/mL" @@ -623,7 +623,7 @@ biomarkers: min: 0.5 max: 4.0 - - name: "VITAMIN B2/RIBOFLAVIN" + - name: "VITAMIN B2 (RIBOFLAVIN)" test_category: VITAMIN category: vitamins unit: "ng/mL" @@ -632,7 +632,7 @@ biomarkers: min: 1.6 max: 68.2 - - name: "VITAMIN B3/NICOTINIC ACID" + - name: "VITAMIN B3 (NIACIN/NICOTINIC ACID)" test_category: VITAMIN category: vitamins unit: "ng/mL" @@ -640,7 +640,7 @@ biomarkers: reference: max: 5.0 - - name: "VITAMIN B5/PANTOTHENIC" + - name: "VITAMIN B5 (PANTOTHENIC 
ACID)" test_category: VITAMIN category: vitamins unit: "ng/mL" @@ -649,7 +649,7 @@ biomarkers: min: 11.0 max: 150.0 - - name: "VITAMIN B6/P5P" + - name: "VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)" test_category: VITAMIN category: vitamins unit: "ng/mL" @@ -658,7 +658,7 @@ biomarkers: min: 5.0 max: 50.0 - - name: "VITAMIN B7/BIOTIN" + - name: "VITAMIN B7 (BIOTIN)" test_category: VITAMIN category: vitamins unit: "ng/mL" @@ -667,7 +667,7 @@ biomarkers: min: 0.2 max: 3.0 - - name: "VITAMIN B9/FOLIC ACID" + - name: "VITAMIN B9 (FOLIC ACID)" test_category: VITAMIN category: vitamins unit: "ng/mL" @@ -676,7 +676,7 @@ biomarkers: min: 0.2 max: 20.0 - - name: "VITAMIN B-12" + - name: "VITAMIN B12 (COBALAMIN)" test_category: VITAMIN category: vitamins unit: "pg/mL" @@ -951,7 +951,7 @@ biomarkers: - { min: 4, max: 10, label: "Moderate risk of future heart attack" } - { min: 10, label: "Elevated risk of future heart attack" } - - name: "HS-CRP" + - name: "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)" test_category: CARDIAC category: cardiac unit: "mg/L" @@ -970,7 +970,7 @@ biomarkers: reference: max: 30.0 - - name: "LP-PLA2" + - name: "LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)" test_category: CARDIAC category: cardiac unit: "nmol/min/mL" @@ -1062,7 +1062,7 @@ biomarkers: min: 2.6 max: 6.0 - - name: "eGFR" + - name: "ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)" test_category: RENAL category: renal unit: "mL/min/1.73m²" @@ -1733,7 +1733,7 @@ biomarkers: category: body unit: "cm" - - name: "BMI" + - name: "BODY MASS INDEX (BMI)" test_category: BODY category: body unit: "kg/m²" @@ -1773,7 +1773,7 @@ biomarkers: - { min: 80, max: 89, label: "High Blood Pressure Stage 1" } - { min: 90, label: "High Blood Pressure Stage 2" } - - name: "SPO2" + - name: "OXYGEN SATURATION (SpO2)" test_category: VITALS category: vitals unit: "%" diff --git a/backend/src/config.rs b/backend/src/config.rs index dc1b126..1f9c292 100644 --- a/backend/src/config.rs +++ b/backend/src/config.rs @@ -12,6 +12,7 
@@ pub struct Config { pub auth: AuthConfig, pub admin: AdminConfig, pub ai: AiConfig, + pub mistral: MistralConfig, } #[derive(Debug, Deserialize)] @@ -20,11 +21,12 @@ pub struct ServerConfig { pub port: u16, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct PathsConfig { pub database: String, pub logs: String, pub uploads: String, + pub max_upload_mb: u32, } #[derive(Debug, Deserialize)] @@ -53,6 +55,15 @@ pub struct AiConfig { pub api_key: String, } +#[derive(Debug, Deserialize, Clone)] +pub struct MistralConfig { + pub api_key: String, + pub ocr_model: String, + pub max_pages_per_request: u32, + pub max_retries: u32, + pub timeout_secs: u64, +} + impl Config { /// Load configuration from a YAML file. pub fn load>(path: P) -> anyhow::Result { diff --git a/backend/src/handlers/mod.rs b/backend/src/handlers/mod.rs index 01640a1..1cb43df 100644 --- a/backend/src/handlers/mod.rs +++ b/backend/src/handlers/mod.rs @@ -5,5 +5,6 @@ pub mod biomarkers; pub mod categories; pub mod diets; pub mod entries; +pub mod ocr; pub mod sources; pub mod users; diff --git a/backend/src/handlers/ocr/matching.rs b/backend/src/handlers/ocr/matching.rs new file mode 100644 index 0000000..c0a2602 --- /dev/null +++ b/backend/src/handlers/ocr/matching.rs @@ -0,0 +1,180 @@ +//! Biomarker matching and merging logic. + +use std::collections::{HashMap, HashSet}; +use strsim::jaro_winkler; + +use super::types::{Biomarker, DocumentAnnotation, OcrResult}; + +/// Fuzzy matching threshold (0.0 - 1.0). +/// Names with Jaro-Winkler similarity >= this value are considered a match. +const FUZZY_THRESHOLD: f64 = 0.90; + +/// Find a matching biomarker name from the valid set. +/// Returns the canonical name if found (exact, alias, or fuzzy match). +/// +/// Matching order: +/// 1. Exact match on full name +/// 2. Extract parenthetical alias from INPUT (e.g., `(HS-CRP)` from `HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)`) +/// 3. 
Extract parenthetical alias from SCHEMA (e.g., `HS-CRP` matches `HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)`) +/// 4. Fuzzy match with Jaro-Winkler (threshold 0.90) +fn find_matching_biomarker(name: &str, valid_biomarkers: &HashSet) -> Option { + let name_upper = name.to_uppercase(); + + // 1. Exact match first (fast path) + if valid_biomarkers.contains(&name_upper) { + return Some(name_upper); + } + + // 2. Try extracting parenthetical alias from INPUT + if let Some(alias) = extract_parenthetical_alias(&name_upper) { + if valid_biomarkers.contains(&alias) { + tracing::debug!( + "Alias matched '{}' -> '{}' (extracted from parentheses in input)", + name, alias + ); + return Some(alias); + } + } + + // 3. Try matching input against aliases in SCHEMA + // This handles input "HS-CRP" matching schema "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)" + for valid in valid_biomarkers { + if let Some(alias) = extract_parenthetical_alias(valid) { + if alias == name_upper { + tracing::debug!( + "Reverse alias matched '{}' -> '{}' (input is alias in schema)", + name, valid + ); + return Some(valid.clone()); + } + } + } + + // 4. Fuzzy match with threshold + valid_biomarkers.iter() + .map(|valid| (valid, jaro_winkler(&name_upper, valid))) + .filter(|(_, score)| *score >= FUZZY_THRESHOLD) + .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap()) + .map(|(matched_name, score)| { + tracing::debug!( + "Fuzzy matched '{}' -> '{}' (score: {:.3})", + name, matched_name, score + ); + matched_name.clone() + }) +} + +/// Extract alias from parentheses or brackets at the end of a name. +/// Examples: +/// - "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)" -> "HS-CRP" +/// - "EST. 
/// Extract an abbreviation-like alias from a trailing `(ALIAS)` group.
///
/// Surrounding whitespace is ignored and the alias is returned upper-cased.
/// Only round parentheses are recognised; a name ending in `]` yields `None`.
///
/// An alias is accepted only when it is 2 to 15 bytes long, contains no space
/// and contains no `)` — the latter rejects nested groups such as
/// `FOO (BAR (BAZ))`, which would otherwise produce the fragment `BAZ)`.
///
/// Examples:
/// - `"HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)"` -> `Some("HS-CRP")`
/// - `"EST. GLOMERULAR FILTRATION RATE (eGFR)"` -> `Some("EGFR")`
/// - `"LIPOPROTEIN (A) [LP(A)]"` -> `None` (does not end with `)`)
/// - `"VITAMIN B9 (FOLIC ACID)"` -> `None` (alias contains a space)
fn extract_parenthetical_alias(name: &str) -> Option<String> {
    // The name must end with ')' ...
    let without_close = name.trim().strip_suffix(')')?;
    // ... and the alias is whatever follows the last '(' before it.
    let (_, alias) = without_close.rsplit_once('(')?;

    let looks_like_abbreviation = (2..=15).contains(&alias.len())
        && !alias.contains(' ')
        && !alias.contains(')');

    if looks_like_abbreviation {
        Some(alias.to_uppercase())
    } else {
        None
    }
}
find_matching_biomarker(&bm.name, valid_biomarkers) { + Some(matched) => { + if matched != bm.name.to_uppercase() { + fuzzy_matched_count += 1; + } + // Update the biomarker name to canonical form + bm.name = matched.clone(); + matched + } + None => { + tracing::debug!("Skipping unknown biomarker: {}", original_name); + skipped_count += 1; + continue; + } + }; + + let has_real_value = bm.value.is_some() || + bm.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false); + + if let Some(existing) = biomarker_map.get(&canonical_name) { + let existing_has_real_value = existing.value.is_some() || + existing.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false); + + // Replace only if current has real value and existing doesn't + if has_real_value && !existing_has_real_value { + biomarker_map.insert(canonical_name, bm); + } + } else { + biomarker_map.insert(canonical_name, bm); + } + } + } + } + + if skipped_count > 0 { + tracing::info!("Skipped {} unknown biomarkers not in schema", skipped_count); + } + if fuzzy_matched_count > 0 { + tracing::info!("Fuzzy matched {} biomarkers to canonical names", fuzzy_matched_count); + } + + // Collect biomarkers from map, filtering out "Not Provided" only entries + merged.biomarkers = biomarker_map.into_values() + .filter(|bm| { + bm.value.is_some() || + bm.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false) + }) + .collect(); + + merged +} diff --git a/backend/src/handlers/ocr/mistral.rs b/backend/src/handlers/ocr/mistral.rs new file mode 100644 index 0000000..513f01e --- /dev/null +++ b/backend/src/handlers/ocr/mistral.rs @@ -0,0 +1,211 @@ +//! Mistral API integration for OCR. 
+ +use reqwest::multipart::{Form, Part}; +use serde_json::{json, Value}; +use std::path::PathBuf; +use std::time::Duration; +use tokio::fs; + +use crate::config::MistralConfig; +use super::types::{Biomarker, DocumentAnnotation, MistralFileResponse, MistralOcrResponse}; +use super::schema::strip_descriptions; + +/// Upload a file to Mistral and return the file ID. +pub async fn upload_to_mistral(config: &MistralConfig, file_path: &PathBuf) -> Result { + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(config.timeout_secs)) + .build() + .map_err(|e| format!("Failed to create HTTP client: {}", e))?; + + let file_bytes = fs::read(file_path) + .await + .map_err(|e| format!("Failed to read file: {}", e))?; + + let file_name = file_path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("document.pdf") + .to_string(); + + let part = Part::bytes(file_bytes) + .file_name(file_name) + .mime_str("application/pdf") + .map_err(|e| format!("MIME error: {}", e))?; + + let form = Form::new() + .text("purpose", "ocr") + .part("file", part); + + let response = client + .post("https://api.mistral.ai/v1/files") + .header("Authorization", format!("Bearer {}", config.api_key)) + .multipart(form) + .send() + .await + .map_err(|e| format!("HTTP request failed: {}", e))?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + return Err(format!("Mistral upload failed: {}", error_text)); + } + + let response_text = response.text().await + .map_err(|e| format!("Failed to read response: {}", e))?; + + tracing::info!("Mistral file upload response: {}", response_text); + + let result: MistralFileResponse = serde_json::from_str(&response_text) + .map_err(|e| format!("Failed to parse response: {} - raw: {}", e, response_text))?; + + tracing::info!("Parsed file upload: id={}, num_pages={:?}", result.id, result.num_pages); + + Ok(result.id) +} + +/// Process OCR for specific pages of an uploaded document. 
+pub async fn ocr_pages( + config: &MistralConfig, + file_id: &str, + pages: &[usize], +) -> Result { + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(config.timeout_secs)) + .build() + .map_err(|e| format!("Failed to create HTTP client: {}", e))?; + + // Load the complete schema from file + let schema_content = std::fs::read_to_string("ocr_schema.json") + .map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?; + let mut schema: Value = serde_json::from_str(&schema_content) + .map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?; + + // Clean the schema - remove meta-fields that Mistral echoes back + if let Some(obj) = schema.as_object_mut() { + obj.remove("$schema"); + obj.remove("name"); + obj.remove("description"); + } + strip_descriptions(&mut schema); + + let body = json!({ + "model": config.ocr_model, + "document": { + "type": "file", + "file_id": file_id + }, + "pages": pages, + "document_annotation_format": { + "type": "json_schema", + "json_schema": { + "name": "LabReport", + "schema": schema + } + } + }); + + let response = client + .post("https://api.mistral.ai/v1/ocr") + .header("Authorization", format!("Bearer {}", config.api_key)) + .header("Content-Type", "application/json") + .json(&body) + .send() + .await + .map_err(|e| format!("OCR request failed: {}", e))?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + return Err(format!("OCR failed: {}", error_text)); + } + + let result: MistralOcrResponse = response + .json() + .await + .map_err(|e| format!("Failed to parse OCR response: {}", e))?; + + let annotation_str = result + .document_annotation + .ok_or_else(|| "No document annotation in response".to_string())?; + + tracing::debug!("Raw annotation from Mistral: {}", &annotation_str); + + // Mistral returns data wrapped in "properties" - extract it + let raw_json: Value = serde_json::from_str(&annotation_str) + .map_err(|e| format!("Failed to parse raw 
JSON: {}", e))?; + + let data_json = if let Some(props) = raw_json.get("properties") { + props.clone() + } else { + raw_json + }; + + // Check if this is a schema-only response (no actual data) + if let Some(biomarkers) = data_json.get("biomarkers") { + if biomarkers.get("type").is_some() && biomarkers.get("items").is_some() { + tracing::warn!("Skipping schema-only response (no data for these pages)"); + return Ok(DocumentAnnotation { + patient_name: None, + patient_age: None, + patient_gender: None, + lab_name: None, + test_date: None, + biomarkers: Some(vec![]), + }); + } + } + + let annotation = parse_annotation(&data_json)?; + + tracing::info!("Parsed annotation: patient={:?}, biomarkers={}", + annotation.patient_name, + annotation.biomarkers.as_ref().map(|b| b.len()).unwrap_or(0)); + + Ok(annotation) +} + +/// Parse annotation handling various Mistral response formats. +fn parse_annotation(data: &Value) -> Result { + let patient_name = data.get("patient_name").and_then(|v| v.as_str()).map(|s| s.to_string()); + let patient_age = data.get("patient_age").and_then(|v| v.as_i64()).map(|n| n as i32); + let patient_gender = data.get("patient_gender").and_then(|v| v.as_str()).map(|s| s.to_string()); + let lab_name = data.get("lab_name").and_then(|v| v.as_str()).map(|s| s.to_string()); + let test_date = data.get("test_date").and_then(|v| v.as_str()).map(|s| s.to_string()); + + // Parse biomarkers - handle nested "properties" format + let biomarkers = if let Some(bm_array) = data.get("biomarkers").and_then(|v| v.as_array()) { + let mut parsed: Vec = vec![]; + for item in bm_array { + // Try direct format first + if let Some(name) = item.get("name").and_then(|v| v.as_str()) { + parsed.push(Biomarker { + name: name.to_string(), + value: item.get("value").and_then(|v| v.as_f64()), + value_string: item.get("value_string").and_then(|v| v.as_str()).map(|s| s.to_string()), + unit: item.get("unit").and_then(|v| v.as_str()).map(|s| s.to_string()), + }); + } + // Try nested 
"properties" format + else if let Some(props) = item.get("properties") { + if let Some(name) = props.get("name").and_then(|v| v.as_str()) { + parsed.push(Biomarker { + name: name.to_string(), + value: props.get("value").and_then(|v| v.as_f64()), + value_string: props.get("value_string").and_then(|v| v.as_str()).map(|s| s.to_string()), + unit: props.get("unit").and_then(|v| v.as_str()).map(|s| s.to_string()), + }); + } + } + } + Some(parsed) + } else { + Some(vec![]) + }; + + Ok(DocumentAnnotation { + patient_name, + patient_age, + patient_gender, + lab_name, + test_date, + biomarkers, + }) +} diff --git a/backend/src/handlers/ocr/mod.rs b/backend/src/handlers/ocr/mod.rs new file mode 100644 index 0000000..79507b0 --- /dev/null +++ b/backend/src/handlers/ocr/mod.rs @@ -0,0 +1,200 @@ +//! OCR API handlers - Mistral OCR integration for document parsing. + +mod matching; +mod mistral; +mod schema; +mod types; + +use std::path::PathBuf; + +use axum::{ + extract::{Path, State}, + http::StatusCode, + Json, +}; +use sea_orm::{ActiveModelTrait, EntityTrait, Set}; + +use crate::models::bio::source; + +// Re-export public types +pub use types::{ErrorResponse, OcrState, ParseResponse}; + +/// Get page count from a local file. +/// For PDFs, uses lopdf to read the actual page count. +/// For other file types (images, etc.), returns 1. +fn get_page_count(file_path: &PathBuf) -> usize { + let extension = file_path.extension() + .and_then(|e| e.to_str()) + .unwrap_or("") + .to_lowercase(); + + if extension == "pdf" { + match lopdf::Document::load(file_path) { + Ok(doc) => { + let count = doc.get_pages().len(); + tracing::info!("PDF page count (local): {}", count); + count + } + Err(e) => { + tracing::warn!("Failed to read PDF page count: {}, defaulting to 1", e); + 1 + } + } + } else { + tracing::info!("Non-PDF file, treating as 1 page"); + 1 + } +} + +/// POST /api/sources/:id/parse - Parse a source document using Mistral OCR. 
+pub async fn parse_source( + State(state): State, + Path(id): Path, +) -> Result, (StatusCode, Json)> { + // 1. Get source from database + let source_entity = source::Entity::find_by_id(id) + .one(&state.db) + .await + .map_err(|e| { + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: format!("Database error: {}", e), + }), + ) + })? + .ok_or_else(|| { + ( + StatusCode::NOT_FOUND, + Json(ErrorResponse { + error: "Source not found".to_string(), + }), + ) + })?; + + let file_path = PathBuf::from(&source_entity.file_path); + + // 2. Upload file to Mistral + let file_id = mistral::upload_to_mistral(&state.mistral, &file_path).await.map_err(|e| { + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: format!("Mistral upload failed: {}", e), + }), + ) + })?; + + // 3. Get page count locally from PDF + let max_pages = get_page_count(&file_path); + let chunk_size = state.mistral.max_pages_per_request as usize; + let max_retries = state.mistral.max_retries; + let mut all_results: Vec = Vec::new(); + let mut failed_chunk: Option = None; + + for start_page in (0..max_pages).step_by(chunk_size) { + let pages: Vec = (start_page..std::cmp::min(start_page + chunk_size, max_pages)).collect(); + + tracing::info!("Processing OCR for pages {:?}", pages); + + // Retry loop for this chunk + let mut attempts = 0; + let mut chunk_result = None; + + while attempts <= max_retries { + match mistral::ocr_pages(&state.mistral, &file_id, &pages).await { + Ok(annotation) => { + chunk_result = Some(annotation); + break; + } + Err(e) => { + if e.contains("out of range") || e.contains("no pages") || e.contains("Invalid page") { + tracing::info!("Reached end of document at pages {:?}", pages); + break; + } + + attempts += 1; + if attempts <= max_retries { + tracing::warn!("OCR chunk error (pages {:?}), attempt {}/{}: {}", pages, attempts, max_retries + 1, e); + } else { + tracing::error!("OCR chunk failed after {} attempts (pages {:?}): {}", max_retries + 1, 
pages, e); + failed_chunk = Some(format!("Pages {:?}: {}", pages, e)); + } + } + } + } + + if let Some(annotation) = chunk_result { + all_results.push(annotation); + } else if failed_chunk.is_some() { + break; + } else { + break; + } + } + + // Fail if any chunk failed + if let Some(error_msg) = failed_chunk { + return Err(( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: format!("OCR parsing failed: {}", error_msg), + }), + )); + } + + if all_results.is_empty() { + return Err(( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: "No OCR results obtained".to_string(), + }), + )); + } + + // 4. Get valid biomarker names from schema + let valid_biomarkers = schema::extract_valid_biomarker_names().map_err(|e| { + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: format!("Failed to read schema: {}", e), + }), + ) + })?; + + tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len()); + + // 5. Merge results with fuzzy matching + let merged = matching::merge_results(all_results, &valid_biomarkers); + + // 6. 
Save to database + let ocr_json = serde_json::to_string(&merged).map_err(|e| { + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: format!("JSON serialization failed: {}", e), + }), + ) + })?; + + let mut active_model: source::ActiveModel = source_entity.into(); + active_model.ocr_data = Set(Some(ocr_json)); + + active_model.update(&state.db).await.map_err(|e| { + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(ErrorResponse { + error: format!("Database update failed: {}", e), + }), + ) + })?; + + Ok(Json(ParseResponse { + success: true, + biomarkers_count: merged.biomarkers.len(), + message: format!( + "Successfully parsed {} biomarkers for {}", + merged.biomarkers.len(), + merged.patient_name.unwrap_or_else(|| "Unknown".to_string()) + ), + })) +} diff --git a/backend/src/handlers/ocr/schema.rs b/backend/src/handlers/ocr/schema.rs new file mode 100644 index 0000000..1fe7f73 --- /dev/null +++ b/backend/src/handlers/ocr/schema.rs @@ -0,0 +1,49 @@ +//! Schema handling utilities. + +use serde_json::Value; +use std::collections::HashSet; + +/// Extract valid biomarker names from the ocr_schema.json enum. 
+pub fn extract_valid_biomarker_names() -> Result, String> { + let schema_content = std::fs::read_to_string("ocr_schema.json") + .map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?; + let schema: Value = serde_json::from_str(&schema_content) + .map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?; + + // Navigate to: properties.biomarkers.items.properties.name.enum + let names = schema + .get("properties") + .and_then(|p| p.get("biomarkers")) + .and_then(|b| b.get("items")) + .and_then(|i| i.get("properties")) + .and_then(|p| p.get("name")) + .and_then(|n| n.get("enum")) + .and_then(|e| e.as_array()) + .ok_or_else(|| "Could not find biomarker name enum in schema".to_string())?; + + let valid_names: HashSet = names + .iter() + .filter_map(|v| v.as_str()) + .map(|s| s.to_uppercase()) + .collect(); + + Ok(valid_names) +} + +/// Recursively remove "description" fields from a JSON value. +pub fn strip_descriptions(value: &mut Value) { + match value { + Value::Object(map) => { + map.remove("description"); + for (_, v) in map.iter_mut() { + strip_descriptions(v); + } + } + Value::Array(arr) => { + for v in arr.iter_mut() { + strip_descriptions(v); + } + } + _ => {} + } +} diff --git a/backend/src/handlers/ocr/types.rs b/backend/src/handlers/ocr/types.rs new file mode 100644 index 0000000..40dd126 --- /dev/null +++ b/backend/src/handlers/ocr/types.rs @@ -0,0 +1,77 @@ +//! Type definitions for OCR module. + +use sea_orm::DatabaseConnection; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +use crate::config::MistralConfig; + +/// State for OCR handlers. +#[derive(Clone)] +pub struct OcrState { + pub db: DatabaseConnection, + pub uploads_path: PathBuf, + pub mistral: MistralConfig, +} + +/// Response for parse endpoint. +#[derive(Serialize)] +pub struct ParseResponse { + pub success: bool, + pub biomarkers_count: usize, + pub message: String, +} + +/// Error response. 
+#[derive(Serialize)] +pub struct ErrorResponse { + pub error: String, +} + +/// Mistral file upload response. +#[derive(Deserialize)] +pub struct MistralFileResponse { + pub id: String, + #[allow(dead_code)] + pub bytes: i64, + pub num_pages: Option, +} + +/// Mistral OCR response. +#[derive(Deserialize)] +pub struct MistralOcrResponse { + pub document_annotation: Option, + #[allow(dead_code)] + pub pages: Option>, +} + +/// Extracted biomarker from OCR. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Biomarker { + pub name: String, + pub value: Option, + pub value_string: Option, + pub unit: Option, +} + +/// Merged OCR result. +#[derive(Debug, Serialize, Deserialize)] +pub struct OcrResult { + pub patient_name: Option, + pub patient_age: Option, + pub patient_gender: Option, + pub lab_name: Option, + pub test_date: Option, + pub biomarkers: Vec, +} + +/// Document annotation from Mistral. +#[derive(Debug, Deserialize)] +pub struct DocumentAnnotation { + pub patient_name: Option, + pub patient_age: Option, + pub patient_gender: Option, + pub lab_name: Option, + pub test_date: Option, + pub biomarkers: Option>, +} diff --git a/backend/src/main.rs b/backend/src/main.rs index d3dd95b..62b121e 100644 --- a/backend/src/main.rs +++ b/backend/src/main.rs @@ -156,13 +156,26 @@ fn create_router(db: DatabaseConnection, config: &config::Config) -> Router { .route("/api/sources/{id}", get(handlers::sources::get_source) .delete(handlers::sources::delete_source)) .route("/api/sources/{id}/ocr", put(handlers::sources::update_ocr)) + .layer(axum::extract::DefaultBodyLimit::max(config.paths.max_upload_mb as usize * 1024 * 1024)) .route_layer(middleware::from_fn(require_auth)) .with_state(sources_state); + // OCR routes (need Mistral config) + let ocr_state = handlers::ocr::OcrState { + db: db.clone(), + uploads_path: PathBuf::from(&config.paths.uploads), + mistral: config.mistral.clone(), + }; + let ocr_routes = Router::new() + .route("/api/sources/{id}/parse", 
post(handlers::ocr::parse_source)) + .route_layer(middleware::from_fn(require_auth)) + .with_state(ocr_state); + Router::new() .merge(public_routes) .merge(protected_routes) .merge(sources_routes) + .merge(ocr_routes) .layer(auth_layer) .with_state(db) } @@ -185,10 +198,18 @@ async fn require_auth( } fn init_logging(config: &config::Config) { - let log_level = config.logging.level.parse().unwrap_or(tracing::Level::INFO); + // Build filter: use configured level for our code, but restrict sqlx/sea_orm + let filter_str = format!( + "{},sqlx=warn,sea_orm=warn", + config.logging.level + ); + + let filter = tracing_subscriber::filter::EnvFilter::try_new(&filter_str) + .unwrap_or_else(|_| tracing_subscriber::filter::EnvFilter::new("info,sqlx=warn,sea_orm=warn")); + tracing_subscriber::registry() .with(tracing_subscriber::fmt::layer()) - .with(tracing_subscriber::filter::LevelFilter::from_level(log_level)) + .with(filter) .init(); } diff --git a/frontend/src/pages/Sources.tsx b/frontend/src/pages/Sources.tsx index 30b3d04..e843196 100644 --- a/frontend/src/pages/Sources.tsx +++ b/frontend/src/pages/Sources.tsx @@ -19,6 +19,7 @@ export function SourcesPage() { const [error, setError] = useState(null) const [dragOver, setDragOver] = useState(false) const [deleteConfirmId, setDeleteConfirmId] = useState(null) + const [parsingId, setParsingId] = useState(null) const fileInputRef = useRef(null) // Fetch sources on mount @@ -98,6 +99,31 @@ export function SourcesPage() { } } + const handleParse = async (id: number) => { + setParsingId(id) + setError(null) + try { + const res = await fetch(`/api/sources/${id}/parse`, { + method: 'POST', + credentials: 'include', + }) + if (res.ok) { + const data = await res.json() + // Refresh sources to show updated status + fetchSources() + console.log('Parsed:', data) + } else { + const err = await res.json() + setError(err.error || 'Parse failed') + } + } catch (e) { + console.error('Failed to parse:', e) + setError('Failed to parse 
document') + } finally { + setParsingId(null) + } + } + const formatFileSize = (bytes: number) => { if (bytes < 1024) return `${bytes} B` if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB` @@ -196,7 +222,13 @@ export function SourcesPage() { Parsed Parsed ) : ( - Pending + )}