feat: implement Mistral OCR document parsing with fuzzy matching and frontend integration

This commit is contained in:
2025-12-21 11:06:57 +05:30
parent c8b4beafff
commit fc6376abec
13 changed files with 1062 additions and 19 deletions

View File

@@ -47,3 +47,9 @@ regex = "1"
# CLI
argh = "0.1"
reqwest = { version = "0.12.26", features = ["multipart", "json"] }
serde_json = "1.0.145"
# PDF parsing for page count
lopdf = "0.36"
strsim = "0.11"

245
backend/ocr_schema.json Normal file
View File

@@ -0,0 +1,245 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"name": "LabReport",
"description": "Extract biomarker data from a medical lab report",
"type": "object",
"properties": {
"patient_name": {
"type": "string",
"description": "Full name of the patient"
},
"patient_age": {
"type": "integer",
"description": "Age of the patient in years"
},
"patient_gender": {
"type": "string",
"enum": [
"male",
"female",
"other"
],
"description": "Gender of the patient"
},
"lab_name": {
"type": "string",
"description": "Name of the laboratory"
},
"test_date": {
"type": "string",
"description": "Date when the sample was collected (YYYY-MM-DD format if possible)"
},
"report_id": {
"type": "string",
"description": "Report ID, barcode, or reference number"
},
"biomarkers": {
"type": "array",
"description": "List of biomarker test results",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "Name of the biomarker/test",
"enum": [
"ARSENIC",
"CADMIUM",
"MERCURY",
"LEAD",
"CHROMIUM",
"BARIUM",
"COBALT",
"CAESIUM",
"THALLIUM",
"URANIUM",
"STRONTIUM",
"ANTIMONY",
"TIN",
"MOLYBDENUM",
"SILVER",
"VANADIUM",
"BERYLLIUM",
"BISMUTH",
"SELENIUM",
"ALUMINIUM",
"NICKEL",
"MANGANESE",
"GLYCOSYLATED HEMOGLOBIN (HbA1c)",
"AVERAGE BLOOD GLUCOSE (ABG)",
"FASTING BLOOD SUGAR (GLUCOSE)",
"INSULIN FASTING",
"FRUCTOSAMINE",
"BLOOD KETONE (D3HB)",
"ERYTHROCYTE SEDIMENTATION RATE (ESR)",
"ANTI CCP (ACCP)",
"ANTI NUCLEAR ANTIBODIES (ANA)",
"HEMOGLOBIN",
"HEMATOCRIT (PCV)",
"TOTAL RED BLOOD CELL COUNT (RBC)",
"MEAN CORPUSCULAR VOLUME (MCV)",
"MEAN CORPUSCULAR HEMOGLOBIN (MCH)",
"MEAN CORP. HEMO. CONC (MCHC)",
"RED CELL DISTRIBUTION WIDTH - SD (RDW-SD)",
"RED CELL DISTRIBUTION WIDTH (RDW-CV)",
"TOTAL LEUCOCYTE COUNT (WBC)",
"NEUTROPHILS PERCENTAGE",
"LYMPHOCYTES PERCENTAGE",
"MONOCYTES PERCENTAGE",
"EOSINOPHILS PERCENTAGE",
"BASOPHILS PERCENTAGE",
"IMMATURE GRANULOCYTE PERCENTAGE (IG%)",
"NUCLEATED RED BLOOD CELLS %",
"NEUTROPHILS ABSOLUTE COUNT",
"LYMPHOCYTES ABSOLUTE COUNT",
"MONOCYTES - ABSOLUTE COUNT",
"BASOPHILS ABSOLUTE COUNT",
"EOSINOPHILS ABSOLUTE COUNT",
"IMMATURE GRANULOCYTES (IG)",
"NUCLEATED RED BLOOD CELLS",
"PLATELET COUNT",
"MEAN PLATELET VOLUME (MPV)",
"PLATELET DISTRIBUTION WIDTH (PDW)",
"PLATELET TO LARGE CELL RATIO (PLCR)",
"PLATELETCRIT (PCT)",
"VITAMIN A",
"VITAMIN E",
"VITAMIN K",
"VITAMIN B1 (THIAMIN)",
"VITAMIN B2 (RIBOFLAVIN)",
"VITAMIN B3 (NIACIN/NICOTINIC ACID)",
"VITAMIN B5 (PANTOTHENIC ACID)",
"VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)",
"VITAMIN B7 (BIOTIN)",
"VITAMIN B9 (FOLIC ACID)",
"VITAMIN B12 (COBALAMIN)",
"VITAMIN D TOTAL",
"VITAMIN D2",
"VITAMIN D3",
"CORTISOL",
"CORTICOSTERONE",
"ANDROSTENEDIONE",
"ESTRADIOL",
"TESTOSTERONE",
"PROGESTERONE",
"17-HYDROXYPROGESTERONE",
"DEHYDROEPIANDROSTERONE (DHEA)",
"DHEA - SULPHATE (DHEAS)",
"DEOXYCORTISOL",
"ALPHA-1-ANTITRYPSIN (AAT)",
"HOMOCYSTEINE",
"TROPONIN I",
"HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)",
"LIPOPROTEIN (A) [Lp(a)]",
"LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)",
"CYSTATIN C",
"BLOOD UREA NITROGEN (BUN)",
"UREA (CALCULATED)",
"CREATININE - SERUM",
"UREA / SR.CREATININE RATIO",
"BUN / SR.CREATININE RATIO",
"CALCIUM",
"URIC ACID",
"ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)",
"TOTAL CHOLESTEROL",
"HDL CHOLESTEROL - DIRECT",
"LDL CHOLESTEROL - DIRECT",
"TRIGLYCERIDES",
"VLDL CHOLESTEROL",
"NON-HDL CHOLESTEROL",
"TC / HDL CHOLESTEROL RATIO",
"LDL / HDL RATIO",
"HDL / LDL RATIO",
"TRIG / HDL RATIO",
"APOLIPOPROTEIN - A1 (APO-A1)",
"APOLIPOPROTEIN - B (APO-B)",
"APO B / APO A1 RATIO",
"IRON",
"TOTAL IRON BINDING CAPACITY (TIBC)",
"% TRANSFERRIN SATURATION",
"FERRITIN",
"UNSAT. IRON-BINDING CAPACITY (UIBC)",
"ALKALINE PHOSPHATASE",
"BILIRUBIN - TOTAL",
"BILIRUBIN - DIRECT",
"BILIRUBIN (INDIRECT)",
"GAMMA GLUTAMYL TRANSFERASE (GGT)",
"ASPARTATE AMINOTRANSFERASE (SGOT)",
"ALANINE TRANSAMINASE (SGPT)",
"SGOT / SGPT RATIO",
"PROTEIN - TOTAL",
"ALBUMIN - SERUM",
"SERUM GLOBULIN",
"SERUM ALB/GLOBULIN RATIO",
"SODIUM",
"POTASSIUM",
"CHLORIDE",
"MAGNESIUM",
"TOTAL TRIIODOTHYRONINE (T3)",
"TOTAL THYROXINE (T4)",
"TSH ULTRASENSITIVE",
"SERUM COPPER",
"SERUM ZINC",
"AMYLASE",
"LIPASE",
"URINARY MICROALBUMIN",
"CREATININE - URINE",
"URI. ALBUMIN/CREATININE RATIO",
"URINE COLOUR",
"URINE APPEARANCE",
"URINE SPECIFIC GRAVITY",
"URINE PH",
"URINARY PROTEIN",
"URINARY GLUCOSE",
"URINE KETONE",
"URINARY BILIRUBIN",
"UROBILINOGEN",
"BILE SALT",
"BILE PIGMENT",
"URINE BLOOD",
"NITRITE",
"LEUCOCYTE ESTERASE",
"MUCUS",
"URINE RBC",
"URINARY LEUCOCYTES (PUS CELLS)",
"EPITHELIAL CELLS",
"CASTS",
"CRYSTALS",
"BACTERIA",
"YEAST",
"PARASITE",
"WEIGHT",
"HEIGHT",
"BODY MASS INDEX (BMI)",
"HEART RATE",
"BLOOD PRESSURE SYSTOLIC",
"BLOOD PRESSURE DIASTOLIC",
"OXYGEN SATURATION (SpO2)",
"BODY TEMPERATURE",
"STEPS",
"CALORIES BURNED"
]
},
"value": {
"type": "number",
"description": "Observed/measured value"
},
"value_string": {
"type": "string",
"description": "Value as string if non-numeric (e.g., 'Negative', 'Trace', '> 65')"
},
"unit": {
"type": "string",
"description": "Unit of measurement"
}
},
"required": [
"name"
]
}
}
},
"required": [
"biomarkers"
]
}

View File

@@ -10,6 +10,7 @@ paths:
database: "./data/zhealth.db"
logs: "./logs"
uploads: "./data/uploads"
max_upload_mb: 50 # Maximum file upload size in MB
logging:
level: "info" # Options: trace | debug | info | warn | error
@@ -29,3 +30,12 @@ ai:
provider: "gemini" # Options: gemini | openai | anthropic
model: "gemini-3-flash-preview"
api_key: "${AI_API_KEY}"
# Mistral OCR for document parsing
mistral:
api_key: "${MISTRAL_API_KEY}"
ocr_model: "mistral-ocr-latest"
max_pages_per_request: 8
max_retries: 2 # Max retry attempts per chunk
timeout_secs: 120 # Request timeout in seconds

View File

@@ -214,7 +214,7 @@ biomarkers:
# ============================================================================
# DIABETES / METABOLIC - Scale-based interpretations
# ============================================================================
- name: "HbA1c"
- name: "GLYCOSYLATED HEMOGLOBIN (HbA1c)"
test_category: DIABETES
category: metabolic
unit: "%"
@@ -339,7 +339,7 @@ biomarkers:
min: 36.0
max: 44.0
- name: "TOTAL RBC"
- name: "TOTAL RED BLOOD CELL COUNT (RBC)"
test_category: HEMOGRAM
category: blood
unit: "10^6/µL"
@@ -614,7 +614,7 @@ biomarkers:
min: 0.13
max: 1.19
- name: "VITAMIN B1/THIAMIN"
- name: "VITAMIN B1 (THIAMIN)"
test_category: VITAMIN
category: vitamins
unit: "ng/mL"
@@ -623,7 +623,7 @@ biomarkers:
min: 0.5
max: 4.0
- name: "VITAMIN B2/RIBOFLAVIN"
- name: "VITAMIN B2 (RIBOFLAVIN)"
test_category: VITAMIN
category: vitamins
unit: "ng/mL"
@@ -632,7 +632,7 @@ biomarkers:
min: 1.6
max: 68.2
- name: "VITAMIN B3/NICOTINIC ACID"
- name: "VITAMIN B3 (NIACIN/NICOTINIC ACID)"
test_category: VITAMIN
category: vitamins
unit: "ng/mL"
@@ -640,7 +640,7 @@ biomarkers:
reference:
max: 5.0
- name: "VITAMIN B5/PANTOTHENIC"
- name: "VITAMIN B5 (PANTOTHENIC ACID)"
test_category: VITAMIN
category: vitamins
unit: "ng/mL"
@@ -649,7 +649,7 @@ biomarkers:
min: 11.0
max: 150.0
- name: "VITAMIN B6/P5P"
- name: "VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)"
test_category: VITAMIN
category: vitamins
unit: "ng/mL"
@@ -658,7 +658,7 @@ biomarkers:
min: 5.0
max: 50.0
- name: "VITAMIN B7/BIOTIN"
- name: "VITAMIN B7 (BIOTIN)"
test_category: VITAMIN
category: vitamins
unit: "ng/mL"
@@ -667,7 +667,7 @@ biomarkers:
min: 0.2
max: 3.0
- name: "VITAMIN B9/FOLIC ACID"
- name: "VITAMIN B9 (FOLIC ACID)"
test_category: VITAMIN
category: vitamins
unit: "ng/mL"
@@ -676,7 +676,7 @@ biomarkers:
min: 0.2
max: 20.0
- name: "VITAMIN B-12"
- name: "VITAMIN B12 (COBALAMIN)"
test_category: VITAMIN
category: vitamins
unit: "pg/mL"
@@ -951,7 +951,7 @@ biomarkers:
- { min: 4, max: 10, label: "Moderate risk of future heart attack" }
- { min: 10, label: "Elevated risk of future heart attack" }
- name: "HS-CRP"
- name: "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)"
test_category: CARDIAC
category: cardiac
unit: "mg/L"
@@ -970,7 +970,7 @@ biomarkers:
reference:
max: 30.0
- name: "LP-PLA2"
- name: "LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)"
test_category: CARDIAC
category: cardiac
unit: "nmol/min/mL"
@@ -1062,7 +1062,7 @@ biomarkers:
min: 2.6
max: 6.0
- name: "eGFR"
- name: "ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)"
test_category: RENAL
category: renal
unit: "mL/min/1.73m²"
@@ -1733,7 +1733,7 @@ biomarkers:
category: body
unit: "cm"
- name: "BMI"
- name: "BODY MASS INDEX (BMI)"
test_category: BODY
category: body
unit: "kg/m²"
@@ -1773,7 +1773,7 @@ biomarkers:
- { min: 80, max: 89, label: "High Blood Pressure Stage 1" }
- { min: 90, label: "High Blood Pressure Stage 2" }
- name: "SPO2"
- name: "OXYGEN SATURATION (SpO2)"
test_category: VITALS
category: vitals
unit: "%"

View File

@@ -12,6 +12,7 @@ pub struct Config {
pub auth: AuthConfig,
pub admin: AdminConfig,
pub ai: AiConfig,
pub mistral: MistralConfig,
}
#[derive(Debug, Deserialize)]
@@ -20,11 +21,12 @@ pub struct ServerConfig {
pub port: u16,
}
#[derive(Debug, Deserialize)]
#[derive(Debug, Deserialize, Clone)]
pub struct PathsConfig {
pub database: String,
pub logs: String,
pub uploads: String,
pub max_upload_mb: u32,
}
#[derive(Debug, Deserialize)]
@@ -53,6 +55,15 @@ pub struct AiConfig {
pub api_key: String,
}
/// Settings for the Mistral OCR integration (the `mistral:` section of the config file).
#[derive(Debug, Deserialize, Clone)]
pub struct MistralConfig {
/// API key sent as the `Authorization: Bearer` token on Mistral requests.
pub api_key: String,
/// Model identifier sent as `model` in the OCR request body.
pub ocr_model: String,
/// Number of pages grouped into one OCR request (the chunk size).
pub max_pages_per_request: u32,
/// Retries allowed per chunk after the first attempt fails.
pub max_retries: u32,
/// Timeout, in seconds, applied to each HTTP client built for Mistral calls.
pub timeout_secs: u64,
}
impl Config {
/// Load configuration from a YAML file.
pub fn load<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {

View File

@@ -5,5 +5,6 @@ pub mod biomarkers;
pub mod categories;
pub mod diets;
pub mod entries;
pub mod ocr;
pub mod sources;
pub mod users;

View File

@@ -0,0 +1,180 @@
//! Biomarker matching and merging logic.
use std::collections::{HashMap, HashSet};
use strsim::jaro_winkler;
use super::types::{Biomarker, DocumentAnnotation, OcrResult};
/// Fuzzy matching threshold (0.0 - 1.0).
/// Names with Jaro-Winkler similarity >= this value are considered a match.
const FUZZY_THRESHOLD: f64 = 0.90;

/// Resolve an OCR-reported biomarker name to a canonical name from `valid_biomarkers`.
///
/// `valid_biomarkers` is expected to hold upper-cased names; the input is trimmed
/// and upper-cased before comparison, so the returned name is upper-case as well.
///
/// Matching order (first hit wins):
/// 1. Exact match on the full name.
/// 2. The trailing parenthetical alias of the INPUT is itself a valid name
///    (e.g. input `SOMETHING (LEAD)` resolves to `LEAD`).
/// 3. The INPUT equals the trailing parenthetical alias of a valid name
///    (e.g. input `HS-CRP` resolves to `HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)`).
/// 4. Best Jaro-Winkler score at or above [`FUZZY_THRESHOLD`].
///
/// Ties in steps 3 and 4 are broken by picking the lexicographically smallest
/// candidate, so the result does not depend on `HashSet` iteration order.
///
/// Returns `None` when nothing matches.
fn find_matching_biomarker(name: &str, valid_biomarkers: &HashSet<String>) -> Option<String> {
    // Surrounding whitespace from OCR output must not defeat the exact-match fast path.
    let name_upper = name.trim().to_uppercase();

    // 1. Exact match first (fast path)
    if valid_biomarkers.contains(&name_upper) {
        return Some(name_upper);
    }

    // 2. Try extracting parenthetical alias from INPUT
    if let Some(alias) = extract_parenthetical_alias(&name_upper) {
        if valid_biomarkers.contains(&alias) {
            tracing::debug!(
                "Alias matched '{}' -> '{}' (extracted from parentheses in input)",
                name, alias
            );
            return Some(alias);
        }
    }

    // 3. Try matching input against aliases in SCHEMA
    // This handles input "HS-CRP" matching schema "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)".
    // `min()` makes the choice deterministic should two schema names share an alias.
    let reverse_alias_hit = valid_biomarkers
        .iter()
        .filter(|valid| {
            extract_parenthetical_alias(valid.as_str()).as_deref() == Some(name_upper.as_str())
        })
        .min();
    if let Some(valid) = reverse_alias_hit {
        tracing::debug!(
            "Reverse alias matched '{}' -> '{}' (input is alias in schema)",
            name, valid
        );
        return Some(valid.clone());
    }

    // 4. Fuzzy match with threshold.
    // `total_cmp` cannot panic (unlike `partial_cmp(..).unwrap()`), and the reversed
    // name comparison makes the smallest name win when scores are equal.
    valid_biomarkers
        .iter()
        .map(|valid| (valid, jaro_winkler(&name_upper, valid)))
        .filter(|(_, score)| *score >= FUZZY_THRESHOLD)
        .max_by(|a, b| a.1.total_cmp(&b.1).then_with(|| b.0.cmp(a.0)))
        .map(|(matched_name, score)| {
            tracing::debug!(
                "Fuzzy matched '{}' -> '{}' (score: {:.3})",
                name, matched_name, score
            );
            matched_name.clone()
        })
}
/// Pull a short alias out of a trailing `(...)` group, upper-cased.
///
/// The name is trimmed first. An alias is only returned when the name ends with
/// `)`, and the text after the last `(` is 2 to 15 bytes long and contains no
/// space. Square brackets are not treated as alias delimiters.
///
/// Examples:
/// - "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)" -> "HS-CRP"
/// - "EST. GLOMERULAR FILTRATION RATE (eGFR)" -> "EGFR"
/// - "VITAMIN B9 (FOLIC ACID)" -> None (contains a space)
/// - "LIPOPROTEIN (A) [LP(A)]" -> None (does not end with `)`)
fn extract_parenthetical_alias(name: &str) -> Option<String> {
    let trimmed = name.trim();

    // The name must close with ')'; drop it so only the alias text remains after '('.
    let without_close = trimmed.strip_suffix(')')?;
    let open = without_close.rfind('(')?;
    let candidate = &without_close[open + 1..];

    // Accept only short, single-token candidates that look like an abbreviation.
    let plausible = (2..=15).contains(&candidate.len()) && !candidate.contains(' ');
    if plausible {
        Some(candidate.to_uppercase())
    } else {
        None
    }
}
/// Returns `true` when a biomarker carries an actual result: either a numeric
/// `value`, or a `value_string` other than the "not provided" placeholder
/// (compared ASCII case-insensitively).
fn has_real_value(bm: &Biomarker) -> bool {
    bm.value.is_some()
        || bm
            .value_string
            .as_deref()
            .map_or(false, |s| !s.eq_ignore_ascii_case("not provided"))
}

/// Merge multiple OCR results into one, filtering to only known biomarkers.
///
/// * Metadata fields (`patient_name`, `patient_age`, ...) take the first non-`None`
///   value seen across `results`.
/// * Each biomarker name is resolved to a canonical name via
///   `find_matching_biomarker`; names that cannot be resolved are skipped.
/// * When the same canonical biomarker appears more than once, the first entry
///   is kept unless it has no real value and a later one does.
/// * Entries without a real value are dropped from the final list.
///
/// The returned biomarkers keep the order in which each canonical name was first
/// encountered, so the output is deterministic for a given input.
pub fn merge_results(results: Vec<DocumentAnnotation>, valid_biomarkers: &HashSet<String>) -> OcrResult {
    let mut merged = OcrResult {
        patient_name: None,
        patient_age: None,
        patient_gender: None,
        lab_name: None,
        test_date: None,
        biomarkers: Vec::new(),
    };

    // Biomarkers in first-seen order, plus an index from canonical name to position.
    // (Collecting straight from a HashMap would yield an arbitrary order.)
    let mut ordered: Vec<Biomarker> = Vec::new();
    let mut index_by_name: HashMap<String, usize> = HashMap::new();
    let mut skipped_count = 0;
    let mut fuzzy_matched_count = 0;

    for result in results {
        // Take first non-null metadata
        merged.patient_name = merged.patient_name.take().or(result.patient_name);
        merged.patient_age = merged.patient_age.take().or(result.patient_age);
        merged.patient_gender = merged.patient_gender.take().or(result.patient_gender);
        merged.lab_name = merged.lab_name.take().or(result.lab_name);
        merged.test_date = merged.test_date.take().or(result.test_date);

        // Merge biomarkers with fuzzy matching
        for mut bm in result.biomarkers.unwrap_or_default() {
            let canonical_name = match find_matching_biomarker(&bm.name, valid_biomarkers) {
                Some(matched) => matched,
                None => {
                    tracing::debug!("Skipping unknown biomarker: {}", bm.name);
                    skipped_count += 1;
                    continue;
                }
            };

            // Anything other than a case-insensitive exact hit counts as a fuzzy/alias match.
            if canonical_name != bm.name.to_uppercase() {
                fuzzy_matched_count += 1;
            }
            // Update the biomarker name to canonical form
            bm.name = canonical_name.clone();

            match index_by_name.get(&canonical_name).copied() {
                Some(position) => {
                    // Replace only if current has real value and existing doesn't
                    if has_real_value(&bm) && !has_real_value(&ordered[position]) {
                        ordered[position] = bm;
                    }
                }
                None => {
                    index_by_name.insert(canonical_name, ordered.len());
                    ordered.push(bm);
                }
            }
        }
    }

    if skipped_count > 0 {
        tracing::info!("Skipped {} unknown biomarkers not in schema", skipped_count);
    }
    if fuzzy_matched_count > 0 {
        tracing::info!("Fuzzy matched {} biomarkers to canonical names", fuzzy_matched_count);
    }

    // Drop entries that never received a real value ("Not Provided" only).
    merged.biomarkers = ordered
        .into_iter()
        .filter(|bm| has_real_value(bm))
        .collect();

    merged
}

View File

@@ -0,0 +1,211 @@
//! Mistral API integration for OCR.
use reqwest::multipart::{Form, Part};
use serde_json::{json, Value};
use std::path::PathBuf;
use std::time::Duration;
use tokio::fs;
use crate::config::MistralConfig;
use super::types::{Biomarker, DocumentAnnotation, MistralFileResponse, MistralOcrResponse};
use super::schema::strip_descriptions;
/// Upload a file to Mistral and return the file ID.
///
/// Reads the whole file at `file_path` into memory and posts it as a multipart
/// form (`purpose=ocr`) to the Mistral files endpoint, authenticated with
/// `config.api_key`. The request uses `config.timeout_secs` as its timeout.
///
/// # Errors
///
/// Returns a human-readable message when the client cannot be built, the file
/// cannot be read, the request fails, Mistral answers with a non-success status
/// (the status code and response body are included), or the response body
/// cannot be parsed.
pub async fn upload_to_mistral(config: &MistralConfig, file_path: &PathBuf) -> Result<String, String> {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(config.timeout_secs))
        .build()
        .map_err(|e| format!("Failed to create HTTP client: {}", e))?;

    let file_bytes = fs::read(file_path)
        .await
        .map_err(|e| format!("Failed to read file: {}", e))?;

    // Fall back to a generic name when the path has no (UTF-8) file name component.
    let file_name = file_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("document.pdf")
        .to_string();

    // NOTE(review): the part is always labelled as a PDF, even for non-PDF uploads — confirm
    // whether Mistral relies on this MIME type.
    let part = Part::bytes(file_bytes)
        .file_name(file_name)
        .mime_str("application/pdf")
        .map_err(|e| format!("MIME error: {}", e))?;

    let form = Form::new()
        .text("purpose", "ocr")
        .part("file", part);

    let response = client
        .post("https://api.mistral.ai/v1/files")
        .header("Authorization", format!("Bearer {}", config.api_key))
        .multipart(form)
        .send()
        .await
        .map_err(|e| format!("HTTP request failed: {}", e))?;

    // Capture the status before consuming the body so failures with an empty or
    // unreadable body still produce a meaningful message.
    let status = response.status();
    if !status.is_success() {
        let error_text = response.text().await.unwrap_or_default();
        return Err(format!("Mistral upload failed (HTTP {}): {}", status, error_text));
    }

    let response_text = response.text().await
        .map_err(|e| format!("Failed to read response: {}", e))?;
    tracing::info!("Mistral file upload response: {}", response_text);

    let result: MistralFileResponse = serde_json::from_str(&response_text)
        .map_err(|e| format!("Failed to parse response: {} - raw: {}", e, response_text))?;
    tracing::info!("Parsed file upload: id={}, num_pages={:?}", result.id, result.num_pages);

    Ok(result.id)
}
/// Process OCR for specific pages of an uploaded document.
pub async fn ocr_pages(
config: &MistralConfig,
file_id: &str,
pages: &[usize],
) -> Result<DocumentAnnotation, String> {
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(config.timeout_secs))
.build()
.map_err(|e| format!("Failed to create HTTP client: {}", e))?;
// Load the complete schema from file
let schema_content = std::fs::read_to_string("ocr_schema.json")
.map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?;
let mut schema: Value = serde_json::from_str(&schema_content)
.map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?;
// Clean the schema - remove meta-fields that Mistral echoes back
if let Some(obj) = schema.as_object_mut() {
obj.remove("$schema");
obj.remove("name");
obj.remove("description");
}
strip_descriptions(&mut schema);
let body = json!({
"model": config.ocr_model,
"document": {
"type": "file",
"file_id": file_id
},
"pages": pages,
"document_annotation_format": {
"type": "json_schema",
"json_schema": {
"name": "LabReport",
"schema": schema
}
}
});
let response = client
.post("https://api.mistral.ai/v1/ocr")
.header("Authorization", format!("Bearer {}", config.api_key))
.header("Content-Type", "application/json")
.json(&body)
.send()
.await
.map_err(|e| format!("OCR request failed: {}", e))?;
if !response.status().is_success() {
let error_text = response.text().await.unwrap_or_default();
return Err(format!("OCR failed: {}", error_text));
}
let result: MistralOcrResponse = response
.json()
.await
.map_err(|e| format!("Failed to parse OCR response: {}", e))?;
let annotation_str = result
.document_annotation
.ok_or_else(|| "No document annotation in response".to_string())?;
tracing::debug!("Raw annotation from Mistral: {}", &annotation_str);
// Mistral returns data wrapped in "properties" - extract it
let raw_json: Value = serde_json::from_str(&annotation_str)
.map_err(|e| format!("Failed to parse raw JSON: {}", e))?;
let data_json = if let Some(props) = raw_json.get("properties") {
props.clone()
} else {
raw_json
};
// Check if this is a schema-only response (no actual data)
if let Some(biomarkers) = data_json.get("biomarkers") {
if biomarkers.get("type").is_some() && biomarkers.get("items").is_some() {
tracing::warn!("Skipping schema-only response (no data for these pages)");
return Ok(DocumentAnnotation {
patient_name: None,
patient_age: None,
patient_gender: None,
lab_name: None,
test_date: None,
biomarkers: Some(vec![]),
});
}
}
let annotation = parse_annotation(&data_json)?;
tracing::info!("Parsed annotation: patient={:?}, biomarkers={}",
annotation.patient_name,
annotation.biomarkers.as_ref().map(|b| b.len()).unwrap_or(0));
Ok(annotation)
}
/// Parse annotation handling various Mistral response formats.
fn parse_annotation(data: &Value) -> Result<DocumentAnnotation, String> {
let patient_name = data.get("patient_name").and_then(|v| v.as_str()).map(|s| s.to_string());
let patient_age = data.get("patient_age").and_then(|v| v.as_i64()).map(|n| n as i32);
let patient_gender = data.get("patient_gender").and_then(|v| v.as_str()).map(|s| s.to_string());
let lab_name = data.get("lab_name").and_then(|v| v.as_str()).map(|s| s.to_string());
let test_date = data.get("test_date").and_then(|v| v.as_str()).map(|s| s.to_string());
// Parse biomarkers - handle nested "properties" format
let biomarkers = if let Some(bm_array) = data.get("biomarkers").and_then(|v| v.as_array()) {
let mut parsed: Vec<Biomarker> = vec![];
for item in bm_array {
// Try direct format first
if let Some(name) = item.get("name").and_then(|v| v.as_str()) {
parsed.push(Biomarker {
name: name.to_string(),
value: item.get("value").and_then(|v| v.as_f64()),
value_string: item.get("value_string").and_then(|v| v.as_str()).map(|s| s.to_string()),
unit: item.get("unit").and_then(|v| v.as_str()).map(|s| s.to_string()),
});
}
// Try nested "properties" format
else if let Some(props) = item.get("properties") {
if let Some(name) = props.get("name").and_then(|v| v.as_str()) {
parsed.push(Biomarker {
name: name.to_string(),
value: props.get("value").and_then(|v| v.as_f64()),
value_string: props.get("value_string").and_then(|v| v.as_str()).map(|s| s.to_string()),
unit: props.get("unit").and_then(|v| v.as_str()).map(|s| s.to_string()),
});
}
}
}
Some(parsed)
} else {
Some(vec![])
};
Ok(DocumentAnnotation {
patient_name,
patient_age,
patient_gender,
lab_name,
test_date,
biomarkers,
})
}

View File

@@ -0,0 +1,200 @@
//! OCR API handlers - Mistral OCR integration for document parsing.
mod matching;
mod mistral;
mod schema;
mod types;
use std::path::PathBuf;
use axum::{
extract::{Path, State},
http::StatusCode,
Json,
};
use sea_orm::{ActiveModelTrait, EntityTrait, Set};
use crate::models::bio::source;
// Re-export public types
pub use types::{ErrorResponse, OcrState, ParseResponse};
/// Determine how many pages a local file has.
///
/// Files whose extension is `pdf` (case-insensitive) are opened with lopdf and
/// their page count is returned; if the PDF cannot be loaded a warning is logged
/// and `1` is returned. Every other file is treated as a single page.
fn get_page_count(file_path: &PathBuf) -> usize {
    let is_pdf = file_path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(|ext| ext.to_lowercase() == "pdf")
        .unwrap_or(false);

    if !is_pdf {
        tracing::info!("Non-PDF file, treating as 1 page");
        return 1;
    }

    let document = match lopdf::Document::load(file_path) {
        Ok(document) => document,
        Err(e) => {
            tracing::warn!("Failed to read PDF page count: {}, defaulting to 1", e);
            return 1;
        }
    };

    let count = document.get_pages().len();
    tracing::info!("PDF page count (local): {}", count);
    count
}
/// POST /api/sources/:id/parse - Parse a source document using Mistral OCR.
pub async fn parse_source(
State(state): State<OcrState>,
Path(id): Path<i32>,
) -> Result<Json<ParseResponse>, (StatusCode, Json<ErrorResponse>)> {
// 1. Get source from database
let source_entity = source::Entity::find_by_id(id)
.one(&state.db)
.await
.map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Database error: {}", e),
}),
)
})?
.ok_or_else(|| {
(
StatusCode::NOT_FOUND,
Json(ErrorResponse {
error: "Source not found".to_string(),
}),
)
})?;
let file_path = PathBuf::from(&source_entity.file_path);
// 2. Upload file to Mistral
let file_id = mistral::upload_to_mistral(&state.mistral, &file_path).await.map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Mistral upload failed: {}", e),
}),
)
})?;
// 3. Get page count locally from PDF
let max_pages = get_page_count(&file_path);
let chunk_size = state.mistral.max_pages_per_request as usize;
let max_retries = state.mistral.max_retries;
let mut all_results: Vec<types::DocumentAnnotation> = Vec::new();
let mut failed_chunk: Option<String> = None;
for start_page in (0..max_pages).step_by(chunk_size) {
let pages: Vec<usize> = (start_page..std::cmp::min(start_page + chunk_size, max_pages)).collect();
tracing::info!("Processing OCR for pages {:?}", pages);
// Retry loop for this chunk
let mut attempts = 0;
let mut chunk_result = None;
while attempts <= max_retries {
match mistral::ocr_pages(&state.mistral, &file_id, &pages).await {
Ok(annotation) => {
chunk_result = Some(annotation);
break;
}
Err(e) => {
if e.contains("out of range") || e.contains("no pages") || e.contains("Invalid page") {
tracing::info!("Reached end of document at pages {:?}", pages);
break;
}
attempts += 1;
if attempts <= max_retries {
tracing::warn!("OCR chunk error (pages {:?}), attempt {}/{}: {}", pages, attempts, max_retries + 1, e);
} else {
tracing::error!("OCR chunk failed after {} attempts (pages {:?}): {}", max_retries + 1, pages, e);
failed_chunk = Some(format!("Pages {:?}: {}", pages, e));
}
}
}
}
if let Some(annotation) = chunk_result {
all_results.push(annotation);
} else if failed_chunk.is_some() {
break;
} else {
break;
}
}
// Fail if any chunk failed
if let Some(error_msg) = failed_chunk {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("OCR parsing failed: {}", error_msg),
}),
));
}
if all_results.is_empty() {
return Err((
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: "No OCR results obtained".to_string(),
}),
));
}
// 4. Get valid biomarker names from schema
let valid_biomarkers = schema::extract_valid_biomarker_names().map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Failed to read schema: {}", e),
}),
)
})?;
tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len());
// 5. Merge results with fuzzy matching
let merged = matching::merge_results(all_results, &valid_biomarkers);
// 6. Save to database
let ocr_json = serde_json::to_string(&merged).map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("JSON serialization failed: {}", e),
}),
)
})?;
let mut active_model: source::ActiveModel = source_entity.into();
active_model.ocr_data = Set(Some(ocr_json));
active_model.update(&state.db).await.map_err(|e| {
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(ErrorResponse {
error: format!("Database update failed: {}", e),
}),
)
})?;
Ok(Json(ParseResponse {
success: true,
biomarkers_count: merged.biomarkers.len(),
message: format!(
"Successfully parsed {} biomarkers for {}",
merged.biomarkers.len(),
merged.patient_name.unwrap_or_else(|| "Unknown".to_string())
),
}))
}

View File

@@ -0,0 +1,49 @@
//! Schema handling utilities.
use serde_json::Value;
use std::collections::HashSet;
/// Extract valid biomarker names from the ocr_schema.json enum.
pub fn extract_valid_biomarker_names() -> Result<HashSet<String>, String> {
let schema_content = std::fs::read_to_string("ocr_schema.json")
.map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?;
let schema: Value = serde_json::from_str(&schema_content)
.map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?;
// Navigate to: properties.biomarkers.items.properties.name.enum
let names = schema
.get("properties")
.and_then(|p| p.get("biomarkers"))
.and_then(|b| b.get("items"))
.and_then(|i| i.get("properties"))
.and_then(|p| p.get("name"))
.and_then(|n| n.get("enum"))
.and_then(|e| e.as_array())
.ok_or_else(|| "Could not find biomarker name enum in schema".to_string())?;
let valid_names: HashSet<String> = names
.iter()
.filter_map(|v| v.as_str())
.map(|s| s.to_uppercase())
.collect();
Ok(valid_names)
}
/// Recursively remove "description" annotations from a JSON schema value.
///
/// Every object has its `"description"` key removed, and the function recurses
/// into all nested objects and arrays. The one exception is the map directly
/// under a `"properties"` key: its keys are field names rather than schema
/// keywords, so a field that happens to be called `description` is kept (its
/// own schema is still cleaned).
pub fn strip_descriptions(value: &mut Value) {
    match value {
        Value::Object(map) => {
            map.remove("description");
            for (key, child) in map.iter_mut() {
                if key.as_str() == "properties" {
                    if let Value::Object(fields) = &mut *child {
                        // Keys here are property names; only clean each property's schema.
                        for field_schema in fields.values_mut() {
                            strip_descriptions(field_schema);
                        }
                        continue;
                    }
                }
                strip_descriptions(child);
            }
        }
        Value::Array(items) => {
            for item in items.iter_mut() {
                strip_descriptions(item);
            }
        }
        _ => {}
    }
}

View File

@@ -0,0 +1,77 @@
//! Type definitions for OCR module.
use sea_orm::DatabaseConnection;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use crate::config::MistralConfig;
/// Shared state handed to the OCR route handlers.
#[derive(Clone)]
pub struct OcrState {
/// Database connection used to load the source row and store the OCR result.
pub db: DatabaseConnection,
/// Uploads directory, populated from `paths.uploads` in the configuration.
pub uploads_path: PathBuf,
/// Mistral settings (API key, model, chunk size, retries, timeout).
pub mistral: MistralConfig,
}
/// JSON body returned by the parse endpoint on success.
#[derive(Serialize)]
pub struct ParseResponse {
/// Set to `true` by the parse handler when parsing completed.
pub success: bool,
/// Number of biomarkers kept after merging and filtering.
pub biomarkers_count: usize,
/// Human-readable summary of the parse result.
pub message: String,
}
/// JSON body returned alongside an error status code.
#[derive(Serialize)]
pub struct ErrorResponse {
/// Description of what went wrong.
pub error: String,
}
/// Mistral file upload response (only the fields this module reads).
#[derive(Deserialize)]
pub struct MistralFileResponse {
/// Identifier of the uploaded file; passed as `file_id` in later OCR requests.
pub id: String,
// Deserialized but never read, hence the lint allowance.
// NOTE(review): this field is required, so a response without `bytes` fails to parse.
#[allow(dead_code)]
pub bytes: i64,
/// Page count reported by Mistral, if any; currently only logged.
pub num_pages: Option<usize>,
}
/// Mistral OCR response (only the fields this module reads).
#[derive(Deserialize)]
pub struct MistralOcrResponse {
/// Structured annotation as a JSON-encoded string; parsed separately by the caller.
pub document_annotation: Option<String>,
// Per-page payload; deserialized but never read, hence the lint allowance.
#[allow(dead_code)]
pub pages: Option<Vec<serde_json::Value>>,
}
/// A single biomarker result extracted from OCR output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Biomarker {
/// Name of the biomarker/test; rewritten to the canonical schema name during merging.
pub name: String,
/// Observed/measured numeric value, when one was extracted.
pub value: Option<f64>,
/// Value as text when it is non-numeric (e.g. "Negative", "Trace", "> 65").
pub value_string: Option<String>,
/// Unit of measurement, when reported.
pub unit: Option<String>,
}
/// OCR result merged across all processed page chunks.
///
/// Serialized to JSON and stored on the source record by the parse handler.
#[derive(Debug, Serialize, Deserialize)]
pub struct OcrResult {
/// Patient name; first non-empty value found across chunks.
pub patient_name: Option<String>,
/// Patient age; first non-empty value found across chunks.
pub patient_age: Option<i32>,
/// Patient gender; first non-empty value found across chunks.
pub patient_gender: Option<String>,
/// Laboratory name; first non-empty value found across chunks.
pub lab_name: Option<String>,
/// Test date as reported (free-form string); first non-empty value found across chunks.
pub test_date: Option<String>,
/// Biomarkers that matched a known name and carry a real value.
pub biomarkers: Vec<Biomarker>,
}
/// Annotation extracted from one Mistral OCR request (one chunk of pages).
///
/// Every field is optional because a chunk may not contain that information.
#[derive(Debug, Deserialize)]
pub struct DocumentAnnotation {
/// Patient name, if present in this chunk.
pub patient_name: Option<String>,
/// Patient age, if present in this chunk.
pub patient_age: Option<i32>,
/// Patient gender, if present in this chunk.
pub patient_gender: Option<String>,
/// Laboratory name, if present in this chunk.
pub lab_name: Option<String>,
/// Test date as a free-form string, if present in this chunk.
pub test_date: Option<String>,
/// Biomarkers found in this chunk, with names as reported (not yet canonicalized).
pub biomarkers: Option<Vec<Biomarker>>,
}

View File

@@ -156,13 +156,26 @@ fn create_router(db: DatabaseConnection, config: &config::Config) -> Router {
.route("/api/sources/{id}", get(handlers::sources::get_source)
.delete(handlers::sources::delete_source))
.route("/api/sources/{id}/ocr", put(handlers::sources::update_ocr))
.layer(axum::extract::DefaultBodyLimit::max(config.paths.max_upload_mb as usize * 1024 * 1024))
.route_layer(middleware::from_fn(require_auth))
.with_state(sources_state);
// OCR routes (need Mistral config)
let ocr_state = handlers::ocr::OcrState {
db: db.clone(),
uploads_path: PathBuf::from(&config.paths.uploads),
mistral: config.mistral.clone(),
};
let ocr_routes = Router::new()
.route("/api/sources/{id}/parse", post(handlers::ocr::parse_source))
.route_layer(middleware::from_fn(require_auth))
.with_state(ocr_state);
Router::new()
.merge(public_routes)
.merge(protected_routes)
.merge(sources_routes)
.merge(ocr_routes)
.layer(auth_layer)
.with_state(db)
}
@@ -185,10 +198,18 @@ async fn require_auth(
}
fn init_logging(config: &config::Config) {
let log_level = config.logging.level.parse().unwrap_or(tracing::Level::INFO);
// Build filter: use configured level for our code, but restrict sqlx/sea_orm
let filter_str = format!(
"{},sqlx=warn,sea_orm=warn",
config.logging.level
);
let filter = tracing_subscriber::filter::EnvFilter::try_new(&filter_str)
.unwrap_or_else(|_| tracing_subscriber::filter::EnvFilter::new("info,sqlx=warn,sea_orm=warn"));
tracing_subscriber::registry()
.with(tracing_subscriber::fmt::layer())
.with(tracing_subscriber::filter::LevelFilter::from_level(log_level))
.with(filter)
.init();
}