feat: implement Mistral OCR document parsing with fuzzy matching and frontend integration

2025-12-21 11:06:57 +05:30
parent c8b4beafff
commit fc6376abec
13 changed files with 1062 additions and 19 deletions
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -47,3 +47,9 @@ regex = "1"

 # CLI
 argh = "0.1"
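+
+# HTTP client and JSON handling for the Mistral OCR API
+reqwest = { version = "0.12.26", features = ["multipart", "json"] }
+serde_json = "1.0.145"
+
+# PDF parsing for page count
+lopdf = "0.36"
+
+# Fuzzy string matching for biomarker names
+strsim = "0.11"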
--- a/backend/ocr_schema.json
+++ b/backend/ocr_schema.json
@@ -0,0 +1,245 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "name": "LabReport",
+    "description": "Extract biomarker data from a medical lab report",
+    "type": "object",
+    "properties": {
+        "patient_name": {
+            "type": "string",
+            "description": "Full name of the patient"
+        },
+        "patient_age": {
+            "type": "integer",
+            "description": "Age of the patient in years"
+        },
+        "patient_gender": {
+            "type": "string",
+            "enum": [
+                "male",
+                "female",
+                "other"
+            ],
+            "description": "Gender of the patient"
+        },
+        "lab_name": {
+            "type": "string",
+            "description": "Name of the laboratory"
+        },
+        "test_date": {
+            "type": "string",
+            "description": "Date when the sample was collected (YYYY-MM-DD format if possible)"
+        },
+        "report_id": {
+            "type": "string",
+            "description": "Report ID, barcode, or reference number"
+        },
+        "biomarkers": {
+            "type": "array",
+            "description": "List of biomarker test results",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "name": {
+                        "type": "string",
+                        "description": "Name of the biomarker/test",
+                        "enum": [
+                            "ARSENIC",
+                            "CADMIUM",
+                            "MERCURY",
+                            "LEAD",
+                            "CHROMIUM",
+                            "BARIUM",
+                            "COBALT",
+                            "CAESIUM",
+                            "THALLIUM",
+                            "URANIUM",
+                            "STRONTIUM",
+                            "ANTIMONY",
+                            "TIN",
+                            "MOLYBDENUM",
+                            "SILVER",
+                            "VANADIUM",
+                            "BERYLLIUM",
+                            "BISMUTH",
+                            "SELENIUM",
+                            "ALUMINIUM",
+                            "NICKEL",
+                            "MANGANESE",
+                            "GLYCOSYLATED HEMOGLOBIN (HbA1c)",
+                            "AVERAGE BLOOD GLUCOSE (ABG)",
+                            "FASTING BLOOD SUGAR (GLUCOSE)",
+                            "INSULIN FASTING",
+                            "FRUCTOSAMINE",
+                            "BLOOD KETONE (D3HB)",
+                            "ERYTHROCYTE SEDIMENTATION RATE (ESR)",
+                            "ANTI CCP (ACCP)",
+                            "ANTI NUCLEAR ANTIBODIES (ANA)",
+                            "HEMOGLOBIN",
+                            "HEMATOCRIT (PCV)",
+                            "TOTAL RED BLOOD CELL COUNT (RBC)",
+                            "MEAN CORPUSCULAR VOLUME (MCV)",
+                            "MEAN CORPUSCULAR HEMOGLOBIN (MCH)",
+                            "MEAN CORP. HEMO. CONC (MCHC)",
+                            "RED CELL DISTRIBUTION WIDTH - SD (RDW-SD)",
+                            "RED CELL DISTRIBUTION WIDTH (RDW-CV)",
+                            "TOTAL LEUCOCYTE COUNT (WBC)",
+                            "NEUTROPHILS PERCENTAGE",
+                            "LYMPHOCYTES PERCENTAGE",
+                            "MONOCYTES PERCENTAGE",
+                            "EOSINOPHILS PERCENTAGE",
+                            "BASOPHILS PERCENTAGE",
+                            "IMMATURE GRANULOCYTE PERCENTAGE (IG%)",
+                            "NUCLEATED RED BLOOD CELLS %",
+                            "NEUTROPHILS ABSOLUTE COUNT",
+                            "LYMPHOCYTES ABSOLUTE COUNT",
+                            "MONOCYTES - ABSOLUTE COUNT",
+                            "BASOPHILS ABSOLUTE COUNT",
+                            "EOSINOPHILS ABSOLUTE COUNT",
+                            "IMMATURE GRANULOCYTES (IG)",
+                            "NUCLEATED RED BLOOD CELLS",
+                            "PLATELET COUNT",
+                            "MEAN PLATELET VOLUME (MPV)",
+                            "PLATELET DISTRIBUTION WIDTH (PDW)",
+                            "PLATELET TO LARGE CELL RATIO (PLCR)",
+                            "PLATELETCRIT (PCT)",
+                            "VITAMIN A",
+                            "VITAMIN E",
+                            "VITAMIN K",
+                            "VITAMIN B1 (THIAMIN)",
+                            "VITAMIN B2 (RIBOFLAVIN)",
+                            "VITAMIN B3 (NIACIN/NICOTINIC ACID)",
+                            "VITAMIN B5 (PANTOTHENIC ACID)",
+                            "VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)",
+                            "VITAMIN B7 (BIOTIN)",
+                            "VITAMIN B9 (FOLIC ACID)",
+                            "VITAMIN B12 (COBALAMIN)",
+                            "VITAMIN D TOTAL",
+                            "VITAMIN D2",
+                            "VITAMIN D3",
+                            "CORTISOL",
+                            "CORTICOSTERONE",
+                            "ANDROSTENEDIONE",
+                            "ESTRADIOL",
+                            "TESTOSTERONE",
+                            "PROGESTERONE",
+                            "17-HYDROXYPROGESTERONE",
+                            "DEHYDROEPIANDROSTERONE (DHEA)",
+                            "DHEA - SULPHATE (DHEAS)",
+                            "DEOXYCORTISOL",
+                            "ALPHA-1-ANTITRYPSIN (AAT)",
+                            "HOMOCYSTEINE",
+                            "TROPONIN I",
+                            "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)",
+                            "LIPOPROTEIN (A) [Lp(a)]",
+                            "LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)",
+                            "CYSTATIN C",
+                            "BLOOD UREA NITROGEN (BUN)",
+                            "UREA (CALCULATED)",
+                            "CREATININE - SERUM",
+                            "UREA / SR.CREATININE RATIO",
+                            "BUN / SR.CREATININE RATIO",
+                            "CALCIUM",
+                            "URIC ACID",
+                            "ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)",
+                            "TOTAL CHOLESTEROL",
+                            "HDL CHOLESTEROL - DIRECT",
+                            "LDL CHOLESTEROL - DIRECT",
+                            "TRIGLYCERIDES",
+                            "VLDL CHOLESTEROL",
+                            "NON-HDL CHOLESTEROL",
+                            "TC / HDL CHOLESTEROL RATIO",
+                            "LDL / HDL RATIO",
+                            "HDL / LDL RATIO",
+                            "TRIG / HDL RATIO",
+                            "APOLIPOPROTEIN - A1 (APO-A1)",
+                            "APOLIPOPROTEIN - B (APO-B)",
+                            "APO B / APO A1 RATIO",
+                            "IRON",
+                            "TOTAL IRON BINDING CAPACITY (TIBC)",
+                            "% TRANSFERRIN SATURATION",
+                            "FERRITIN",
+                            "UNSAT. IRON-BINDING CAPACITY (UIBC)",
+                            "ALKALINE PHOSPHATASE",
+                            "BILIRUBIN - TOTAL",
+                            "BILIRUBIN - DIRECT",
+                            "BILIRUBIN (INDIRECT)",
+                            "GAMMA GLUTAMYL TRANSFERASE (GGT)",
+                            "ASPARTATE AMINOTRANSFERASE (SGOT)",
+                            "ALANINE TRANSAMINASE (SGPT)",
+                            "SGOT / SGPT RATIO",
+                            "PROTEIN - TOTAL",
+                            "ALBUMIN - SERUM",
+                            "SERUM GLOBULIN",
+                            "SERUM ALB/GLOBULIN RATIO",
+                            "SODIUM",
+                            "POTASSIUM",
+                            "CHLORIDE",
+                            "MAGNESIUM",
+                            "TOTAL TRIIODOTHYRONINE (T3)",
+                            "TOTAL THYROXINE (T4)",
+                            "TSH ULTRASENSITIVE",
+                            "SERUM COPPER",
+                            "SERUM ZINC",
+                            "AMYLASE",
+                            "LIPASE",
+                            "URINARY MICROALBUMIN",
+                            "CREATININE - URINE",
+                            "URI. ALBUMIN/CREATININE RATIO",
+                            "URINE COLOUR",
+                            "URINE APPEARANCE",
+                            "URINE SPECIFIC GRAVITY",
+                            "URINE PH",
+                            "URINARY PROTEIN",
+                            "URINARY GLUCOSE",
+                            "URINE KETONE",
+                            "URINARY BILIRUBIN",
+                            "UROBILINOGEN",
+                            "BILE SALT",
+                            "BILE PIGMENT",
+                            "URINE BLOOD",
+                            "NITRITE",
+                            "LEUCOCYTE ESTERASE",
+                            "MUCUS",
+                            "URINE RBC",
+                            "URINARY LEUCOCYTES (PUS CELLS)",
+                            "EPITHELIAL CELLS",
+                            "CASTS",
+                            "CRYSTALS",
+                            "BACTERIA",
+                            "YEAST",
+                            "PARASITE",
+                            "WEIGHT",
+                            "HEIGHT",
+                            "BODY MASS INDEX (BMI)",
+                            "HEART RATE",
+                            "BLOOD PRESSURE SYSTOLIC",
+                            "BLOOD PRESSURE DIASTOLIC",
+                            "OXYGEN SATURATION (SpO2)",
+                            "BODY TEMPERATURE",
+                            "STEPS",
+                            "CALORIES BURNED"
+                        ]
+                    },
+                    "value": {
+                        "type": "number",
+                        "description": "Observed/measured value"
+                    },
+                    "value_string": {
+                        "type": "string",
+                        "description": "Value as string if non-numeric (e.g., 'Negative', 'Trace', '> 65')"
+                    },
+                    "unit": {
+                        "type": "string",
+                        "description": "Unit of measurement"
+                    }
+                },
+                "required": [
+                    "name"
+                ]
+            }
+        }
+    },
+    "required": [
+        "biomarkers"
+    ]
+}
--- a/backend/sample.config.yaml
+++ b/backend/sample.config.yaml
@@ -10,6 +10,7 @@ paths:
  database: "./data/zhealth.db"
  logs: "./logs"
  uploads: "./data/uploads"
+  max_upload_mb: 50  # Maximum file upload size in MB

 logging:
  level: "info"  # Options: trace | debug | info | warn | error
@@ -29,3 +30,12 @@ ai:
  provider: "gemini"  # Options: gemini | openai | anthropic
  model: "gemini-3-flash-preview"
  api_key: "${AI_API_KEY}"
+
+# Mistral OCR for document parsing
+mistral:
+  api_key: "${MISTRAL_API_KEY}"
+  ocr_model: "mistral-ocr-latest"
+  max_pages_per_request: 8
+  max_retries: 2        # Max retry attempts per chunk
+  timeout_secs: 120     # Request timeout in seconds
+
--- a/backend/seed_biomarkers.yaml
+++ b/backend/seed_biomarkers.yaml
@@ -214,7 +214,7 @@ biomarkers:
  # ============================================================================
  # DIABETES / METABOLIC - Scale-based interpretations
  # ============================================================================
-  - name: "HbA1c"
+  - name: "GLYCOSYLATED HEMOGLOBIN (HbA1c)"
    test_category: DIABETES
    category: metabolic
    unit: "%"
@@ -339,7 +339,7 @@ biomarkers:
        min: 36.0
        max: 44.0

-  - name: "TOTAL RBC"
+  - name: "TOTAL RED BLOOD CELL COUNT (RBC)"
    test_category: HEMOGRAM
    category: blood
    unit: "10^6/µL"
@@ -614,7 +614,7 @@ biomarkers:
      min: 0.13
      max: 1.19

-  - name: "VITAMIN B1/THIAMIN"
+  - name: "VITAMIN B1 (THIAMIN)"
    test_category: VITAMIN
    category: vitamins
    unit: "ng/mL"
@@ -623,7 +623,7 @@ biomarkers:
      min: 0.5
      max: 4.0

-  - name: "VITAMIN B2/RIBOFLAVIN"
+  - name: "VITAMIN B2 (RIBOFLAVIN)"
    test_category: VITAMIN
    category: vitamins
    unit: "ng/mL"
@@ -632,7 +632,7 @@ biomarkers:
      min: 1.6
      max: 68.2

-  - name: "VITAMIN B3/NICOTINIC ACID"
+  - name: "VITAMIN B3 (NIACIN/NICOTINIC ACID)"
    test_category: VITAMIN
    category: vitamins
    unit: "ng/mL"
@@ -640,7 +640,7 @@ biomarkers:
    reference:
      max: 5.0

-  - name: "VITAMIN B5/PANTOTHENIC"
+  - name: "VITAMIN B5 (PANTOTHENIC ACID)"
    test_category: VITAMIN
    category: vitamins
    unit: "ng/mL"
@@ -649,7 +649,7 @@ biomarkers:
      min: 11.0
      max: 150.0

-  - name: "VITAMIN B6/P5P"
+  - name: "VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)"
    test_category: VITAMIN
    category: vitamins
    unit: "ng/mL"
@@ -658,7 +658,7 @@ biomarkers:
      min: 5.0
      max: 50.0

-  - name: "VITAMIN B7/BIOTIN"
+  - name: "VITAMIN B7 (BIOTIN)"
    test_category: VITAMIN
    category: vitamins
    unit: "ng/mL"
@@ -667,7 +667,7 @@ biomarkers:
      min: 0.2
      max: 3.0

-  - name: "VITAMIN B9/FOLIC ACID"
+  - name: "VITAMIN B9 (FOLIC ACID)"
    test_category: VITAMIN
    category: vitamins
    unit: "ng/mL"
@@ -676,7 +676,7 @@ biomarkers:
      min: 0.2
      max: 20.0

-  - name: "VITAMIN B-12"
+  - name: "VITAMIN B12 (COBALAMIN)"
    test_category: VITAMIN
    category: vitamins
    unit: "pg/mL"
@@ -951,7 +951,7 @@ biomarkers:
        - { min: 4, max: 10, label: "Moderate risk of future heart attack" }
        - { min: 10, label: "Elevated risk of future heart attack" }

-  - name: "HS-CRP"
+  - name: "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)"
    test_category: CARDIAC
    category: cardiac
    unit: "mg/L"
@@ -970,7 +970,7 @@ biomarkers:
    reference:
      max: 30.0

-  - name: "LP-PLA2"
+  - name: "LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)"
    test_category: CARDIAC
    category: cardiac
    unit: "nmol/min/mL"
@@ -1062,7 +1062,7 @@ biomarkers:
        min: 2.6
        max: 6.0

-  - name: "eGFR"
+  - name: "ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)"
    test_category: RENAL
    category: renal
    unit: "mL/min/1.73m²"
@@ -1733,7 +1733,7 @@ biomarkers:
    category: body
    unit: "cm"

-  - name: "BMI"
+  - name: "BODY MASS INDEX (BMI)"
    test_category: BODY
    category: body
    unit: "kg/m²"
@@ -1773,7 +1773,7 @@ biomarkers:
      - { min: 80, max: 89, label: "High Blood Pressure Stage 1" }
      - { min: 90, label: "High Blood Pressure Stage 2" }

-  - name: "SPO2"
+  - name: "OXYGEN SATURATION (SpO2)"
    test_category: VITALS
    category: vitals
    unit: "%"
--- a/backend/src/config.rs
+++ b/backend/src/config.rs
@@ -12,6 +12,7 @@ pub struct Config {
    pub auth: AuthConfig,
    pub admin: AdminConfig,
    pub ai: AiConfig,
+    pub mistral: MistralConfig,
 }

 #[derive(Debug, Deserialize)]
@@ -20,11 +21,12 @@ pub struct ServerConfig {
    pub port: u16,
 }

-#[derive(Debug, Deserialize)]
+#[derive(Debug, Deserialize, Clone)]
 pub struct PathsConfig {
    pub database: String,
    pub logs: String,
    pub uploads: String,
+    pub max_upload_mb: u32,
 }

 #[derive(Debug, Deserialize)]
@@ -53,6 +55,15 @@ pub struct AiConfig {
    pub api_key: String,
 }

+/// Settings for the Mistral OCR integration (the `mistral` section of the config file).
+#[derive(Debug, Deserialize, Clone)]
+pub struct MistralConfig {
+    /// API key sent as the `Authorization: Bearer` credential on Mistral requests.
+    pub api_key: String,
+    /// Model identifier placed in the `model` field of OCR requests.
+    pub ocr_model: String,
+    /// Page limit per OCR request. NOTE(review): presumably the chunk size used
+    /// when a document is split; confirm against the handler that reads it.
+    pub max_pages_per_request: u32,
+    /// Maximum retry attempts per chunk (as described in the sample config).
+    pub max_retries: u32,
+    /// Timeout, in seconds, applied to each HTTP client built for Mistral calls.
+    pub timeout_secs: u64,
+}
+
 impl Config {
    /// Load configuration from a YAML file.
    pub fn load<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
--- a/backend/src/handlers/mod.rs
+++ b/backend/src/handlers/mod.rs
@@ -5,5 +5,6 @@ pub mod biomarkers;
 pub mod categories;
 pub mod diets;
 pub mod entries;
+pub mod ocr;
 pub mod sources;
 pub mod users;
--- a/backend/src/handlers/ocr/matching.rs
+++ b/backend/src/handlers/ocr/matching.rs
@@ -0,0 +1,180 @@
+//! Biomarker matching and merging logic.
+
+use std::collections::{HashMap, HashSet};
+use strsim::jaro_winkler;
+
+use super::types::{Biomarker, DocumentAnnotation, OcrResult};
+
+/// Fuzzy matching threshold (0.0 - 1.0).
+/// Names with Jaro-Winkler similarity >= this value are considered a match.
+const FUZZY_THRESHOLD: f64 = 0.90;
+
+/// Find a matching biomarker name from the valid set.
+///
+/// Returns the canonical name on an exact, alias, or fuzzy match, and `None`
+/// when nothing in `valid_biomarkers` is close enough.
+///
+/// The input is trimmed and uppercased before comparison. Set entries are
+/// compared as stored, so the exact and fuzzy steps work best when
+/// `valid_biomarkers` holds uppercase names.
+/// NOTE(review): confirm how the caller builds the set.
+///
+/// Matching order:
+/// 1. Exact match on the full name.
+/// 2. The trailing parenthetical alias of the INPUT is itself a valid name
+///    (e.g. input `C-REACTIVE PROTEIN (HS-CRP)` when the set contains `HS-CRP`).
+/// 3. The INPUT equals the trailing parenthetical alias of a valid name
+///    (e.g. input `HS-CRP` matches `HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)`).
+/// 4. Best Jaro-Winkler score at or above `FUZZY_THRESHOLD`.
+///
+/// Steps 3 and 4 break ties by choosing the lexicographically smallest
+/// candidate, so the result never depends on `HashSet` iteration order.
+fn find_matching_biomarker(name: &str, valid_biomarkers: &HashSet<String>) -> Option<String> {
+    // Stray whitespace from OCR output must not defeat the exact-match path.
+    let name_upper = name.trim().to_uppercase();
+
+    // 1. Exact match first (fast path)
+    if valid_biomarkers.contains(&name_upper) {
+        return Some(name_upper);
+    }
+
+    // 2. Try extracting parenthetical alias from INPUT
+    if let Some(alias) = extract_parenthetical_alias(&name_upper) {
+        if valid_biomarkers.contains(&alias) {
+            tracing::debug!(
+                "Alias matched '{}' -> '{}' (extracted from parentheses in input)",
+                name, alias
+            );
+            return Some(alias);
+        }
+    }
+
+    // 3. Try matching input against aliases in SCHEMA
+    // This handles input "HS-CRP" matching schema "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)"
+    let reverse_alias_match = valid_biomarkers
+        .iter()
+        .filter(|valid| {
+            extract_parenthetical_alias(valid.as_str()).as_deref() == Some(name_upper.as_str())
+        })
+        // Smallest name wins if several schema entries share the alias.
+        .min();
+    if let Some(valid) = reverse_alias_match {
+        tracing::debug!(
+            "Reverse alias matched '{}' -> '{}' (input is alias in schema)",
+            name, valid
+        );
+        return Some(valid.clone());
+    }
+
+    // 4. Fuzzy match with threshold
+    valid_biomarkers
+        .iter()
+        .map(|valid| (valid, jaro_winkler(&name_upper, valid)))
+        .filter(|(_, score)| *score >= FUZZY_THRESHOLD)
+        // Highest score wins; equal scores fall back to the smaller name.
+        // `total_cmp` cannot panic, unlike `partial_cmp(..).unwrap()`.
+        .max_by(|a, b| a.1.total_cmp(&b.1).then_with(|| b.0.cmp(a.0)))
+        .map(|(matched_name, score)| {
+            tracing::debug!(
+                "Fuzzy matched '{}' -> '{}' (score: {:.3})",
+                name, matched_name, score
+            );
+            matched_name.clone()
+        })
+}
+
+/// Pull the abbreviation out of a trailing `(...)` group, uppercased.
+///
+/// Only a group that closes at the very end of the (trimmed) name counts, and
+/// the text after the last `(` must be 2 to 15 bytes long with no spaces.
+/// Square brackets are not recognised.
+///
+/// Examples:
+/// - "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)" -> "HS-CRP"
+/// - "EST. GLOMERULAR FILTRATION RATE (eGFR)" -> "EGFR"
+/// - "LIPOPROTEIN (A) [LP(A)]" -> None (does not end with `)`)
+/// - "VITAMIN B9 (FOLIC ACID)" -> None (contains a space)
+fn extract_parenthetical_alias(name: &str) -> Option<String> {
+    // The name must end with ')' once surrounding whitespace is ignored.
+    let without_close = name.trim().strip_suffix(')')?;
+
+    // Everything after the last '(' is the candidate alias.
+    let open_idx = without_close.rfind('(')?;
+    let candidate = &without_close[open_idx + 1..];
+
+    // Accept only short, single-token candidates that look like abbreviations.
+    let looks_like_alias = (2..=15).contains(&candidate.len()) && !candidate.contains(' ');
+    if looks_like_alias {
+        Some(candidate.to_uppercase())
+    } else {
+        None
+    }
+}
+
+/// Report whether a biomarker carries an actual reading.
+///
+/// A reading is either a numeric `value` or a `value_string` that is neither
+/// blank nor the "Not Provided" placeholder (compared case-insensitively).
+fn has_real_value(bm: &Biomarker) -> bool {
+    bm.value.is_some()
+        || bm
+            .value_string
+            .as_deref()
+            .map(str::trim)
+            .map_or(false, |s| !s.is_empty() && !s.eq_ignore_ascii_case("not provided"))
+}
+
+/// Merge multiple OCR results into one, filtering to only known biomarkers.
+///
+/// Metadata fields take the first non-null value seen across `results`.
+/// Each biomarker name is resolved to its canonical form with
+/// `find_matching_biomarker`; names with no match are skipped. When the same
+/// canonical name appears more than once, the first entry is kept unless it has
+/// no real reading and a later one does. Entries that never received a real
+/// reading are dropped from the result.
+///
+/// The returned biomarkers keep the order in which each canonical name was
+/// first seen, so the output is stable across runs.
+pub fn merge_results(results: Vec<DocumentAnnotation>, valid_biomarkers: &HashSet<String>) -> OcrResult {
+    let mut merged = OcrResult {
+        patient_name: None,
+        patient_age: None,
+        patient_gender: None,
+        lab_name: None,
+        test_date: None,
+        biomarkers: Vec::new(),
+    };
+
+    // Kept biomarkers in first-seen order, plus an index from canonical name
+    // to position so duplicates are found without scanning the vector.
+    let mut kept: Vec<Biomarker> = Vec::new();
+    let mut position_by_name: HashMap<String, usize> = HashMap::new();
+    let mut skipped_count = 0;
+    let mut fuzzy_matched_count = 0;
+
+    for result in results {
+        // Take first non-null metadata
+        if merged.patient_name.is_none() {
+            merged.patient_name = result.patient_name;
+        }
+        if merged.patient_age.is_none() {
+            merged.patient_age = result.patient_age;
+        }
+        if merged.patient_gender.is_none() {
+            merged.patient_gender = result.patient_gender;
+        }
+        if merged.lab_name.is_none() {
+            merged.lab_name = result.lab_name;
+        }
+        if merged.test_date.is_none() {
+            merged.test_date = result.test_date;
+        }
+
+        // Merge biomarkers with fuzzy matching
+        for mut bm in result.biomarkers.unwrap_or_default() {
+            // Try to find a matching canonical name
+            let canonical_name = match find_matching_biomarker(&bm.name, valid_biomarkers) {
+                Some(matched) => matched,
+                None => {
+                    tracing::debug!("Skipping unknown biomarker: {}", bm.name);
+                    skipped_count += 1;
+                    continue;
+                }
+            };
+
+            if canonical_name != bm.name.to_uppercase() {
+                fuzzy_matched_count += 1;
+            }
+            // Update the biomarker name to canonical form
+            bm.name = canonical_name.clone();
+
+            match position_by_name.get(&canonical_name).copied() {
+                Some(idx) => {
+                    // Replace only if current has real value and existing doesn't
+                    if has_real_value(&bm) && !has_real_value(&kept[idx]) {
+                        kept[idx] = bm;
+                    }
+                }
+                None => {
+                    position_by_name.insert(canonical_name, kept.len());
+                    kept.push(bm);
+                }
+            }
+        }
+    }
+
+    if skipped_count > 0 {
+        tracing::info!("Skipped {} unknown biomarkers not in schema", skipped_count);
+    }
+    if fuzzy_matched_count > 0 {
+        tracing::info!("Fuzzy matched {} biomarkers to canonical names", fuzzy_matched_count);
+    }
+
+    // Drop entries that never received a real reading ("Not Provided" or blank only).
+    kept.retain(has_real_value);
+    merged.biomarkers = kept;
+
+    merged
+}
--- a/backend/src/handlers/ocr/mistral.rs
+++ b/backend/src/handlers/ocr/mistral.rs
@@ -0,0 +1,211 @@
+//! Mistral API integration for OCR.
+
+use reqwest::multipart::{Form, Part};
+use serde_json::{json, Value};
+use std::path::PathBuf;
+use std::time::Duration;
+use tokio::fs;
+
+use crate::config::MistralConfig;
+use super::types::{Biomarker, DocumentAnnotation, MistralFileResponse, MistralOcrResponse};
+use super::schema::strip_descriptions;
+
+/// Upload a file to Mistral and return the file ID.
+///
+/// Reads the whole file at `file_path` into memory and posts it as a multipart
+/// form (`purpose=ocr`) to the Mistral files endpoint, authenticating with
+/// `config.api_key` and applying `config.timeout_secs` as the request timeout.
+///
+/// # Errors
+/// Returns a human-readable message when the HTTP client cannot be built, the
+/// file cannot be read, the request fails, Mistral answers with a non-success
+/// status (the status code is included), or the response body cannot be parsed.
+pub async fn upload_to_mistral(config: &MistralConfig, file_path: &PathBuf) -> Result<String, String> {
+    let client = reqwest::Client::builder()
+        .timeout(Duration::from_secs(config.timeout_secs))
+        .build()
+        .map_err(|e| format!("Failed to create HTTP client: {}", e))?;
+
+    let file_bytes = fs::read(file_path)
+        .await
+        .map_err(|e| format!("Failed to read file: {}", e))?;
+
+    // Fall back to a generic name when the path has no UTF-8 file name.
+    let file_name = file_path
+        .file_name()
+        .and_then(|n| n.to_str())
+        .unwrap_or("document.pdf")
+        .to_string();
+
+    // NOTE(review): every upload is labelled application/pdf; confirm this is
+    // acceptable if non-PDF files are ever sent through this path.
+    let part = Part::bytes(file_bytes)
+        .file_name(file_name)
+        .mime_str("application/pdf")
+        .map_err(|e| format!("MIME error: {}", e))?;
+
+    let form = Form::new()
+        .text("purpose", "ocr")
+        .part("file", part);
+
+    let response = client
+        .post("https://api.mistral.ai/v1/files")
+        .header("Authorization", format!("Bearer {}", config.api_key))
+        .multipart(form)
+        .send()
+        .await
+        .map_err(|e| format!("HTTP request failed: {}", e))?;
+
+    // Capture the status before the body is consumed so the error stays
+    // informative even when the body is empty or unreadable.
+    let status = response.status();
+    if !status.is_success() {
+        let error_text = response.text().await.unwrap_or_default();
+        return Err(format!("Mistral upload failed (HTTP {}): {}", status, error_text));
+    }
+
+    let response_text = response.text().await
+        .map_err(|e| format!("Failed to read response: {}", e))?;
+
+    tracing::info!("Mistral file upload response: {}", response_text);
+
+    let result: MistralFileResponse = serde_json::from_str(&response_text)
+        .map_err(|e| format!("Failed to parse response: {} - raw: {}", e, response_text))?;
+
+    tracing::info!("Parsed file upload: id={}, num_pages={:?}", result.id, result.num_pages);
+
+    Ok(result.id)
+}
+
+/// Process OCR for specific pages of an uploaded document.
+///
+/// Loads `ocr_schema.json` from the current working directory, removes its
+/// top-level `$schema`, `name` and `description` keys, passes it through
+/// `strip_descriptions`, and asks the Mistral OCR endpoint to annotate `pages`
+/// of the uploaded file `file_id` against that schema.
+///
+/// A response that merely echoes the schema back (no data for these pages)
+/// yields an annotation with no metadata and an empty biomarker list.
+///
+/// # Errors
+/// Returns a human-readable message when the HTTP client cannot be built, the
+/// schema file cannot be read or parsed, the request fails, Mistral answers
+/// with a non-success status (the status code is included), or the response
+/// lacks a parseable document annotation.
+pub async fn ocr_pages(
+    config: &MistralConfig,
+    file_id: &str,
+    pages: &[usize],
+) -> Result<DocumentAnnotation, String> {
+    let client = reqwest::Client::builder()
+        .timeout(Duration::from_secs(config.timeout_secs))
+        .build()
+        .map_err(|e| format!("Failed to create HTTP client: {}", e))?;
+
+    // Load the complete schema from file. Async read so the executor thread is
+    // not blocked on disk I/O.
+    let schema_content = fs::read_to_string("ocr_schema.json")
+        .await
+        .map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?;
+    let mut schema: Value = serde_json::from_str(&schema_content)
+        .map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?;
+
+    // Clean the schema - remove meta-fields that Mistral echoes back
+    if let Some(obj) = schema.as_object_mut() {
+        obj.remove("$schema");
+        obj.remove("name");
+        obj.remove("description");
+    }
+    strip_descriptions(&mut schema);
+
+    let body = json!({
+        "model": config.ocr_model,
+        "document": {
+            "type": "file",
+            "file_id": file_id
+        },
+        "pages": pages,
+        "document_annotation_format": {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "LabReport",
+                "schema": schema
+            }
+        }
+    });
+
+    let response = client
+        .post("https://api.mistral.ai/v1/ocr")
+        .header("Authorization", format!("Bearer {}", config.api_key))
+        .header("Content-Type", "application/json")
+        .json(&body)
+        .send()
+        .await
+        .map_err(|e| format!("OCR request failed: {}", e))?;
+
+    // Capture the status before the body is consumed so the error stays
+    // informative even when the body is empty or unreadable.
+    let status = response.status();
+    if !status.is_success() {
+        let error_text = response.text().await.unwrap_or_default();
+        return Err(format!("OCR failed (HTTP {}): {}", status, error_text));
+    }
+
+    let result: MistralOcrResponse = response
+        .json()
+        .await
+        .map_err(|e| format!("Failed to parse OCR response: {}", e))?;
+
+    let annotation_str = result
+        .document_annotation
+        .ok_or_else(|| "No document annotation in response".to_string())?;
+
+    tracing::debug!("Raw annotation from Mistral: {}", &annotation_str);
+
+    // Mistral returns data wrapped in "properties" - extract it
+    let raw_json: Value = serde_json::from_str(&annotation_str)
+        .map_err(|e| format!("Failed to parse raw JSON: {}", e))?;
+
+    let data_json = if let Some(props) = raw_json.get("properties") {
+        props.clone()
+    } else {
+        raw_json
+    };
+
+    // Check if this is a schema-only response (no actual data): the
+    // "biomarkers" entry then looks like a schema node, not an array of results.
+    if let Some(biomarkers) = data_json.get("biomarkers") {
+        if biomarkers.get("type").is_some() && biomarkers.get("items").is_some() {
+            tracing::warn!("Skipping schema-only response (no data for these pages)");
+            return Ok(DocumentAnnotation {
+                patient_name: None,
+                patient_age: None,
+                patient_gender: None,
+                lab_name: None,
+                test_date: None,
+                biomarkers: Some(vec![]),
+            });
+        }
+    }
+
+    let annotation = parse_annotation(&data_json)?;
+
+    tracing::info!("Parsed annotation: patient={:?}, biomarkers={}",
+        annotation.patient_name,
+        annotation.biomarkers.as_ref().map(|b| b.len()).unwrap_or(0));
+
+    Ok(annotation)
+}
+
+/// Parse annotation handling various Mistral response formats.
+///
+/// Metadata fields are read leniently: a missing key or a value of the wrong
+/// JSON type becomes `None`. `patient_age` is also `None` when the number does
+/// not fit in an `i32`, rather than being silently wrapped.
+///
+/// Each element of the `biomarkers` array may carry its fields directly or
+/// nested under a `properties` object; elements without a string `name` in
+/// either place are skipped. A missing or non-array `biomarkers` entry yields
+/// an empty list.
+///
+/// The current implementation never returns `Err`; the `Result` is kept for
+/// the caller's `?`.
+fn parse_annotation(data: &Value) -> Result<DocumentAnnotation, String> {
+    // Read `key` from `obj` as an owned string, if present and a JSON string.
+    let text_field = |obj: &Value, key: &str| -> Option<String> {
+        obj.get(key).and_then(Value::as_str).map(str::to_owned)
+    };
+
+    let patient_age = data
+        .get("patient_age")
+        .and_then(Value::as_i64)
+        // Checked narrowing: an out-of-range age is dropped, not truncated.
+        .and_then(|n| i32::try_from(n).ok());
+
+    // Parse biomarkers - handle nested "properties" format
+    let biomarkers: Vec<Biomarker> = data
+        .get("biomarkers")
+        .and_then(Value::as_array)
+        .map(|items| {
+            items
+                .iter()
+                .filter_map(|item| {
+                    // Prefer the direct format; otherwise look under "properties".
+                    let node = if item.get("name").and_then(Value::as_str).is_some() {
+                        item
+                    } else {
+                        item.get("properties")?
+                    };
+                    let name = node.get("name").and_then(Value::as_str)?;
+                    Some(Biomarker {
+                        name: name.to_string(),
+                        value: node.get("value").and_then(Value::as_f64),
+                        value_string: text_field(node, "value_string"),
+                        unit: text_field(node, "unit"),
+                    })
+                })
+                .collect()
+        })
+        .unwrap_or_default();
+
+    Ok(DocumentAnnotation {
+        patient_name: text_field(data, "patient_name"),
+        patient_age,
+        patient_gender: text_field(data, "patient_gender"),
+        lab_name: text_field(data, "lab_name"),
+        test_date: text_field(data, "test_date"),
+        biomarkers: Some(biomarkers),
+    })
+}
--- a/backend/src/handlers/ocr/mod.rs
+++ b/backend/src/handlers/ocr/mod.rs
@@ -0,0 +1,200 @@
+//! OCR API handlers - Mistral OCR integration for document parsing.
+
+mod matching;
+mod mistral;
+mod schema;
+mod types;
+
+use std::path::PathBuf;
+
+use axum::{
+    extract::{Path, State},
+    http::StatusCode,
+    Json,
+};
+use sea_orm::{ActiveModelTrait, EntityTrait, Set};
+
+use crate::models::bio::source;
+
+// Re-export public types
+pub use types::{ErrorResponse, OcrState, ParseResponse};
+
+/// Returns how many pages to request from OCR for a local file.
+///
+/// PDFs (extension matched case-insensitively) are loaded with `lopdf` to read
+/// the real page count; any other file, or a PDF that fails to load, counts as 1.
+/// `std::path::Path` is spelled in full because `Path` here is axum's extractor;
+/// callers holding a `PathBuf` can still pass `&path_buf` unchanged.
+fn get_page_count(file_path: &std::path::Path) -> usize {
+    let is_pdf = file_path
+        .extension()
+        .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"));
+    if !is_pdf {
+        tracing::info!("Non-PDF file, treating as 1 page");
+        return 1;
+    }
+    match lopdf::Document::load(file_path) {
+        Ok(doc) => {
+            let count = doc.get_pages().len();
+            tracing::info!("PDF page count (local): {}", count);
+            count
+        }
+        Err(e) => {
+            tracing::warn!("Failed to read PDF page count: {}, defaulting to 1", e);
+            1
+        }
+    }
+}
+
+/// Wraps `error` in this module's JSON error body with a 500 status.
+fn internal_error(error: String) -> (StatusCode, Json<ErrorResponse>) {
+    (StatusCode::INTERNAL_SERVER_ERROR, Json(ErrorResponse { error }))
+}
+
+/// POST /api/sources/:id/parse - Parse a source document using Mistral OCR.
+///
+/// Pipeline:
+/// 1. load the source row `id` from the database;
+/// 2. upload the file at its `file_path` to Mistral;
+/// 3. OCR the document in chunks of `max_pages_per_request` pages, giving each
+///    chunk one attempt plus up to `max_retries` retries;
+/// 4. load the valid biomarker names from the schema;
+/// 5. merge all chunk annotations via `matching::merge_results`;
+/// 6. store the merged result as JSON in the row's `ocr_data` field.
+///
+/// An OCR error mentioning "out of range", "no pages" or "Invalid page" is
+/// treated as the end of the document rather than as a failure.
+///
+/// # Errors
+///
+/// * `404 Not Found` when no source row has the given `id`.
+/// * `500 Internal Server Error` for database, upload, schema, or
+///   serialization failures, when a chunk still fails after all retries, or
+///   when no chunk produced a result.
+pub async fn parse_source(
+    State(state): State<OcrState>,
+    Path(id): Path<i32>,
+) -> Result<Json<ParseResponse>, (StatusCode, Json<ErrorResponse>)> {
+    // 1. Get source from database
+    let source_entity = source::Entity::find_by_id(id)
+        .one(&state.db)
+        .await
+        .map_err(|e| internal_error(format!("Database error: {}", e)))?
+        .ok_or_else(|| {
+            (
+                StatusCode::NOT_FOUND,
+                Json(ErrorResponse {
+                    error: "Source not found".to_string(),
+                }),
+            )
+        })?;
+
+    let file_path = PathBuf::from(&source_entity.file_path);
+
+    // 2. Upload file to Mistral
+    let file_id = mistral::upload_to_mistral(&state.mistral, &file_path)
+        .await
+        .map_err(|e| internal_error(format!("Mistral upload failed: {}", e)))?;
+
+    // 3. Get page count locally from PDF, then OCR it chunk by chunk
+    // NOTE(review): `get_page_count` reads the file synchronously on the async
+    // executor; consider `spawn_blocking` if large PDFs make this slow.
+    let max_pages = get_page_count(&file_path);
+    // `step_by` panics on a step of 0, so a zero page limit in the config is
+    // clamped to one page per request instead of crashing the handler.
+    let chunk_size = (state.mistral.max_pages_per_request as usize).max(1);
+    let max_retries = state.mistral.max_retries;
+    let mut all_results: Vec<types::DocumentAnnotation> = Vec::new();
+
+    'chunks: for start_page in (0..max_pages).step_by(chunk_size) {
+        let end_page = std::cmp::min(start_page + chunk_size, max_pages);
+        let pages: Vec<usize> = (start_page..end_page).collect();
+
+        tracing::info!("Processing OCR for pages {:?}", pages);
+
+        // Retry loop for this chunk: one initial attempt plus `max_retries` retries.
+        let mut attempts = 0;
+        loop {
+            let e = match mistral::ocr_pages(&state.mistral, &file_id, &pages).await {
+                Ok(annotation) => {
+                    all_results.push(annotation);
+                    break;
+                }
+                Err(e) => e,
+            };
+
+            // Treated as running past the last page: keep what we have and
+            // stop requesting further chunks.
+            let past_last_page = e.contains("out of range")
+                || e.contains("no pages")
+                || e.contains("Invalid page");
+            if past_last_page {
+                tracing::info!("Reached end of document at pages {:?}", pages);
+                break 'chunks;
+            }
+
+            attempts += 1;
+            if attempts > max_retries {
+                tracing::error!(
+                    "OCR chunk failed after {} attempts (pages {:?}): {}",
+                    max_retries + 1,
+                    pages,
+                    e
+                );
+                // Fail if any chunk failed: nothing is saved for this request.
+                return Err(internal_error(format!(
+                    "OCR parsing failed: Pages {:?}: {}",
+                    pages, e
+                )));
+            }
+            tracing::warn!(
+                "OCR chunk error (pages {:?}), attempt {}/{}: {}",
+                pages,
+                attempts,
+                max_retries + 1,
+                e
+            );
+        }
+    }
+
+    if all_results.is_empty() {
+        return Err(internal_error("No OCR results obtained".to_string()));
+    }
+
+    // 4. Get valid biomarker names from schema
+    // NOTE(review): the schema file is re-read from disk on every request.
+    let valid_biomarkers = schema::extract_valid_biomarker_names()
+        .map_err(|e| internal_error(format!("Failed to read schema: {}", e)))?;
+
+    tracing::info!(
+        "Loaded {} valid biomarker names from schema",
+        valid_biomarkers.len()
+    );
+
+    // 5. Merge results with fuzzy matching
+    let merged = matching::merge_results(all_results, &valid_biomarkers);
+
+    // 6. Save to database
+    let ocr_json = serde_json::to_string(&merged)
+        .map_err(|e| internal_error(format!("JSON serialization failed: {}", e)))?;
+
+    let mut active_model: source::ActiveModel = source_entity.into();
+    active_model.ocr_data = Set(Some(ocr_json));
+
+    active_model
+        .update(&state.db)
+        .await
+        .map_err(|e| internal_error(format!("Database update failed: {}", e)))?;
+
+    let biomarkers_count = merged.biomarkers.len();
+    let patient = merged.patient_name.as_deref().unwrap_or("Unknown");
+
+    Ok(Json(ParseResponse {
+        success: true,
+        biomarkers_count,
+        message: format!(
+            "Successfully parsed {} biomarkers for {}",
+            biomarkers_count, patient
+        ),
+    }))
+}
--- a/backend/src/handlers/ocr/schema.rs
+++ b/backend/src/handlers/ocr/schema.rs
@@ -0,0 +1,49 @@
+//! Schema handling utilities.
+
+use serde_json::Value;
+use std::collections::HashSet;
+
+/// Loads the set of biomarker names accepted by `ocr_schema.json`.
+///
+/// Reads `ocr_schema.json` relative to the current working directory and
+/// collects the string entries of the enum found at
+/// `properties.biomarkers.items.properties.name.enum`, upper-cased.
+/// Non-string enum entries are skipped.
+///
+/// Returns `Err` with a message if the file cannot be read or parsed, or if
+/// that enum array is missing from the schema.
+pub fn extract_valid_biomarker_names() -> Result<HashSet<String>, String> {
+    const KEYS: [&str; 6] = ["properties", "biomarkers", "items", "properties", "name", "enum"];
+
+    let raw = std::fs::read_to_string("ocr_schema.json")
+        .map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?;
+    let root: Value = serde_json::from_str(&raw)
+        .map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?;
+
+    // Walk down one key at a time; any missing level yields `None`.
+    let entries = KEYS
+        .iter()
+        .try_fold(&root, |node, key| node.get(*key))
+        .and_then(Value::as_array)
+        .ok_or_else(|| "Could not find biomarker name enum in schema".to_string())?;
+
+    Ok(entries.iter().filter_map(Value::as_str).map(str::to_uppercase).collect())
+}
+
+/// Deletes every `"description"` key from `value`, at any nesting depth.
+///
+/// Objects lose their own `"description"` entry and then have each remaining
+/// member visited; arrays have each element visited. Scalars are left as-is.
+/// The value is modified in place.
+pub fn strip_descriptions(value: &mut Value) {
+    match value {
+        Value::Object(fields) => {
+            // Drop this object's own description before descending.
+            fields.remove("description");
+            fields.values_mut().for_each(strip_descriptions);
+        }
+        Value::Array(items) => items.iter_mut().for_each(strip_descriptions),
+        // Null, booleans, numbers and strings have nothing to strip.
+        Value::Null | Value::Bool(_) | Value::Number(_) | Value::String(_) => {}
+    }
+}
--- a/backend/src/handlers/ocr/types.rs
+++ b/backend/src/handlers/ocr/types.rs
@@ -0,0 +1,77 @@
+//! Type definitions for OCR module.
+
+use sea_orm::DatabaseConnection;
+use serde::{Deserialize, Serialize};
+use std::path::PathBuf;
+
+use crate::config::MistralConfig;
+
+/// Shared state injected into the OCR handlers through axum's `State` extractor.
+#[derive(Clone)]
+pub struct OcrState {
+    pub db: DatabaseConnection, // used to load and update `source` rows
+    pub uploads_path: PathBuf, // uploads directory taken from the app config
+    pub mistral: MistralConfig, // Mistral settings, e.g. page and retry limits
+}
+
+/// JSON body returned by the parse endpoint on success.
+#[derive(Serialize)]
+pub struct ParseResponse {
+    pub success: bool, // `parse_source` only ever sends `true`
+    pub biomarkers_count: usize, // number of biomarkers in the merged result
+    pub message: String, // human-readable summary of the parse
+}
+
+/// JSON error body returned alongside a non-success HTTP status.
+#[derive(Serialize)]
+pub struct ErrorResponse {
+    pub error: String, // description of what went wrong
+}
+
+/// Mistral file upload response (only the fields this module deserializes).
+#[derive(Deserialize)]
+pub struct MistralFileResponse {
+    pub id: String, // presumably the file id later passed to OCR requests — TODO confirm
+    #[allow(dead_code)] // NOTE(review): suppresses the unused-field warning
+    pub bytes: i64,
+    pub num_pages: Option<usize>, // optional; a missing field deserializes to `None`
+}
+
+/// Mistral OCR response (only the fields this module deserializes).
+#[derive(Deserialize)]
+pub struct MistralOcrResponse {
+    pub document_annotation: Option<String>, // presumably JSON text — TODO confirm
+    #[allow(dead_code)] // NOTE(review): suppresses the unused-field warning
+    pub pages: Option<Vec<serde_json::Value>>, // per-page payloads, left untyped
+}
+
+/// A single biomarker reading extracted by OCR.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Biomarker {
+    pub name: String, // biomarker/test name
+    pub value: Option<f64>, // numeric result, when one was extracted
+    pub value_string: Option<String>, // result as a string, when provided
+    pub unit: Option<String>, // unit of measurement, when provided
+}
+
+/// Merged OCR result; serializable, with a non-optional biomarker list.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct OcrResult {
+    pub patient_name: Option<String>, // full name of the patient
+    pub patient_age: Option<i32>, // age in years, per the OCR schema
+    pub patient_gender: Option<String>, // schema allows "male", "female", "other"
+    pub lab_name: Option<String>, // name of the laboratory
+    pub test_date: Option<String>, // kept as text; schema asks for YYYY-MM-DD if possible
+    pub biomarkers: Vec<Biomarker>, // empty when nothing was extracted
+}
+
+/// Document annotation from Mistral for one OCR request.
+#[derive(Debug, Deserialize)]
+pub struct DocumentAnnotation {
+    pub patient_name: Option<String>, // full name of the patient
+    pub patient_age: Option<i32>, // age in years, per the OCR schema
+    pub patient_gender: Option<String>, // schema allows "male", "female", "other"
+    pub lab_name: Option<String>, // name of the laboratory
+    pub test_date: Option<String>, // kept as text; schema asks for YYYY-MM-DD if possible
+    pub biomarkers: Option<Vec<Biomarker>>, // the list may be absent, hence `Option`
+}
--- a/backend/src/main.rs
+++ b/backend/src/main.rs
@@ -156,13 +156,26 @@ fn create_router(db: DatabaseConnection, config: &config::Config) -> Router {
        .route("/api/sources/{id}", get(handlers::sources::get_source)
            .delete(handlers::sources::delete_source))
        .route("/api/sources/{id}/ocr", put(handlers::sources::update_ocr))
+        .layer(axum::extract::DefaultBodyLimit::max(config.paths.max_upload_mb as usize * 1024 * 1024))
        .route_layer(middleware::from_fn(require_auth))
        .with_state(sources_state);

+    // OCR routes (need Mistral config)
+    let ocr_state = handlers::ocr::OcrState {
+        db: db.clone(),
+        uploads_path: PathBuf::from(&config.paths.uploads),
+        mistral: config.mistral.clone(),
+    };
+    let ocr_routes = Router::new()
+        .route("/api/sources/{id}/parse", post(handlers::ocr::parse_source))
+        .route_layer(middleware::from_fn(require_auth))
+        .with_state(ocr_state);
+
    Router::new()
        .merge(public_routes)
        .merge(protected_routes)
        .merge(sources_routes)
+        .merge(ocr_routes)
        .layer(auth_layer)
        .with_state(db)
 }
@@ -185,10 +198,18 @@ async fn require_auth(
 }

 fn init_logging(config: &config::Config) {
-    let log_level = config.logging.level.parse().unwrap_or(tracing::Level::INFO);
+    // Build filter: configured level as the default, with sqlx/sea_orm capped at warn
+    let filter_str = format!(
+        "{},sqlx=warn,sea_orm=warn",
+        config.logging.level
+    );
+    
+    let filter = tracing_subscriber::filter::EnvFilter::try_new(&filter_str) // falls back to "info" below if this fails to parse
+        .unwrap_or_else(|_| tracing_subscriber::filter::EnvFilter::new("info,sqlx=warn,sea_orm=warn"));
+    
    tracing_subscriber::registry()
        .with(tracing_subscriber::fmt::layer())
-        .with(tracing_subscriber::filter::LevelFilter::from_level(log_level))
+        .with(filter)
        .init();
 }