feat: implement Mistral OCR document parsing with fuzzy matching and frontend integration
This commit is contained in:
@@ -47,3 +47,9 @@ regex = "1"
|
||||
|
||||
# CLI
|
||||
argh = "0.1"
|
||||
reqwest = { version = "0.12.26", features = ["multipart", "json"] }
|
||||
serde_json = "1.0.145"
|
||||
|
||||
# PDF page counting (lopdf) and fuzzy string matching (strsim)
|
||||
lopdf = "0.36"
|
||||
strsim = "0.11"
|
||||
|
||||
245
backend/ocr_schema.json
Normal file
245
backend/ocr_schema.json
Normal file
@@ -0,0 +1,245 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"name": "LabReport",
|
||||
"description": "Extract biomarker data from a medical lab report",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"patient_name": {
|
||||
"type": "string",
|
||||
"description": "Full name of the patient"
|
||||
},
|
||||
"patient_age": {
|
||||
"type": "integer",
|
||||
"description": "Age of the patient in years"
|
||||
},
|
||||
"patient_gender": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"male",
|
||||
"female",
|
||||
"other"
|
||||
],
|
||||
"description": "Gender of the patient"
|
||||
},
|
||||
"lab_name": {
|
||||
"type": "string",
|
||||
"description": "Name of the laboratory"
|
||||
},
|
||||
"test_date": {
|
||||
"type": "string",
|
||||
"description": "Date when the sample was collected (YYYY-MM-DD format if possible)"
|
||||
},
|
||||
"report_id": {
|
||||
"type": "string",
|
||||
"description": "Report ID, barcode, or reference number"
|
||||
},
|
||||
"biomarkers": {
|
||||
"type": "array",
|
||||
"description": "List of biomarker test results",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Name of the biomarker/test",
|
||||
"enum": [
|
||||
"ARSENIC",
|
||||
"CADMIUM",
|
||||
"MERCURY",
|
||||
"LEAD",
|
||||
"CHROMIUM",
|
||||
"BARIUM",
|
||||
"COBALT",
|
||||
"CAESIUM",
|
||||
"THALLIUM",
|
||||
"URANIUM",
|
||||
"STRONTIUM",
|
||||
"ANTIMONY",
|
||||
"TIN",
|
||||
"MOLYBDENUM",
|
||||
"SILVER",
|
||||
"VANADIUM",
|
||||
"BERYLLIUM",
|
||||
"BISMUTH",
|
||||
"SELENIUM",
|
||||
"ALUMINIUM",
|
||||
"NICKEL",
|
||||
"MANGANESE",
|
||||
"GLYCOSYLATED HEMOGLOBIN (HbA1c)",
|
||||
"AVERAGE BLOOD GLUCOSE (ABG)",
|
||||
"FASTING BLOOD SUGAR (GLUCOSE)",
|
||||
"INSULIN FASTING",
|
||||
"FRUCTOSAMINE",
|
||||
"BLOOD KETONE (D3HB)",
|
||||
"ERYTHROCYTE SEDIMENTATION RATE (ESR)",
|
||||
"ANTI CCP (ACCP)",
|
||||
"ANTI NUCLEAR ANTIBODIES (ANA)",
|
||||
"HEMOGLOBIN",
|
||||
"HEMATOCRIT (PCV)",
|
||||
"TOTAL RED BLOOD CELL COUNT (RBC)",
|
||||
"MEAN CORPUSCULAR VOLUME (MCV)",
|
||||
"MEAN CORPUSCULAR HEMOGLOBIN (MCH)",
|
||||
"MEAN CORP. HEMO. CONC (MCHC)",
|
||||
"RED CELL DISTRIBUTION WIDTH - SD (RDW-SD)",
|
||||
"RED CELL DISTRIBUTION WIDTH (RDW-CV)",
|
||||
"TOTAL LEUCOCYTE COUNT (WBC)",
|
||||
"NEUTROPHILS PERCENTAGE",
|
||||
"LYMPHOCYTES PERCENTAGE",
|
||||
"MONOCYTES PERCENTAGE",
|
||||
"EOSINOPHILS PERCENTAGE",
|
||||
"BASOPHILS PERCENTAGE",
|
||||
"IMMATURE GRANULOCYTE PERCENTAGE (IG%)",
|
||||
"NUCLEATED RED BLOOD CELLS %",
|
||||
"NEUTROPHILS ABSOLUTE COUNT",
|
||||
"LYMPHOCYTES ABSOLUTE COUNT",
|
||||
"MONOCYTES - ABSOLUTE COUNT",
|
||||
"BASOPHILS ABSOLUTE COUNT",
|
||||
"EOSINOPHILS ABSOLUTE COUNT",
|
||||
"IMMATURE GRANULOCYTES (IG)",
|
||||
"NUCLEATED RED BLOOD CELLS",
|
||||
"PLATELET COUNT",
|
||||
"MEAN PLATELET VOLUME (MPV)",
|
||||
"PLATELET DISTRIBUTION WIDTH (PDW)",
|
||||
"PLATELET TO LARGE CELL RATIO (PLCR)",
|
||||
"PLATELETCRIT (PCT)",
|
||||
"VITAMIN A",
|
||||
"VITAMIN E",
|
||||
"VITAMIN K",
|
||||
"VITAMIN B1 (THIAMIN)",
|
||||
"VITAMIN B2 (RIBOFLAVIN)",
|
||||
"VITAMIN B3 (NIACIN/NICOTINIC ACID)",
|
||||
"VITAMIN B5 (PANTOTHENIC ACID)",
|
||||
"VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)",
|
||||
"VITAMIN B7 (BIOTIN)",
|
||||
"VITAMIN B9 (FOLIC ACID)",
|
||||
"VITAMIN B12 (COBALAMIN)",
|
||||
"VITAMIN D TOTAL",
|
||||
"VITAMIN D2",
|
||||
"VITAMIN D3",
|
||||
"CORTISOL",
|
||||
"CORTICOSTERONE",
|
||||
"ANDROSTENEDIONE",
|
||||
"ESTRADIOL",
|
||||
"TESTOSTERONE",
|
||||
"PROGESTERONE",
|
||||
"17-HYDROXYPROGESTERONE",
|
||||
"DEHYDROEPIANDROSTERONE (DHEA)",
|
||||
"DHEA - SULPHATE (DHEAS)",
|
||||
"DEOXYCORTISOL",
|
||||
"ALPHA-1-ANTITRYPSIN (AAT)",
|
||||
"HOMOCYSTEINE",
|
||||
"TROPONIN I",
|
||||
"HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)",
|
||||
"LIPOPROTEIN (A) [Lp(a)]",
|
||||
"LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)",
|
||||
"CYSTATIN C",
|
||||
"BLOOD UREA NITROGEN (BUN)",
|
||||
"UREA (CALCULATED)",
|
||||
"CREATININE - SERUM",
|
||||
"UREA / SR.CREATININE RATIO",
|
||||
"BUN / SR.CREATININE RATIO",
|
||||
"CALCIUM",
|
||||
"URIC ACID",
|
||||
"ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)",
|
||||
"TOTAL CHOLESTEROL",
|
||||
"HDL CHOLESTEROL - DIRECT",
|
||||
"LDL CHOLESTEROL - DIRECT",
|
||||
"TRIGLYCERIDES",
|
||||
"VLDL CHOLESTEROL",
|
||||
"NON-HDL CHOLESTEROL",
|
||||
"TC / HDL CHOLESTEROL RATIO",
|
||||
"LDL / HDL RATIO",
|
||||
"HDL / LDL RATIO",
|
||||
"TRIG / HDL RATIO",
|
||||
"APOLIPOPROTEIN - A1 (APO-A1)",
|
||||
"APOLIPOPROTEIN - B (APO-B)",
|
||||
"APO B / APO A1 RATIO",
|
||||
"IRON",
|
||||
"TOTAL IRON BINDING CAPACITY (TIBC)",
|
||||
"% TRANSFERRIN SATURATION",
|
||||
"FERRITIN",
|
||||
"UNSAT. IRON-BINDING CAPACITY (UIBC)",
|
||||
"ALKALINE PHOSPHATASE",
|
||||
"BILIRUBIN - TOTAL",
|
||||
"BILIRUBIN - DIRECT",
|
||||
"BILIRUBIN (INDIRECT)",
|
||||
"GAMMA GLUTAMYL TRANSFERASE (GGT)",
|
||||
"ASPARTATE AMINOTRANSFERASE (SGOT)",
|
||||
"ALANINE TRANSAMINASE (SGPT)",
|
||||
"SGOT / SGPT RATIO",
|
||||
"PROTEIN - TOTAL",
|
||||
"ALBUMIN - SERUM",
|
||||
"SERUM GLOBULIN",
|
||||
"SERUM ALB/GLOBULIN RATIO",
|
||||
"SODIUM",
|
||||
"POTASSIUM",
|
||||
"CHLORIDE",
|
||||
"MAGNESIUM",
|
||||
"TOTAL TRIIODOTHYRONINE (T3)",
|
||||
"TOTAL THYROXINE (T4)",
|
||||
"TSH ULTRASENSITIVE",
|
||||
"SERUM COPPER",
|
||||
"SERUM ZINC",
|
||||
"AMYLASE",
|
||||
"LIPASE",
|
||||
"URINARY MICROALBUMIN",
|
||||
"CREATININE - URINE",
|
||||
"URI. ALBUMIN/CREATININE RATIO",
|
||||
"URINE COLOUR",
|
||||
"URINE APPEARANCE",
|
||||
"URINE SPECIFIC GRAVITY",
|
||||
"URINE PH",
|
||||
"URINARY PROTEIN",
|
||||
"URINARY GLUCOSE",
|
||||
"URINE KETONE",
|
||||
"URINARY BILIRUBIN",
|
||||
"UROBILINOGEN",
|
||||
"BILE SALT",
|
||||
"BILE PIGMENT",
|
||||
"URINE BLOOD",
|
||||
"NITRITE",
|
||||
"LEUCOCYTE ESTERASE",
|
||||
"MUCUS",
|
||||
"URINE RBC",
|
||||
"URINARY LEUCOCYTES (PUS CELLS)",
|
||||
"EPITHELIAL CELLS",
|
||||
"CASTS",
|
||||
"CRYSTALS",
|
||||
"BACTERIA",
|
||||
"YEAST",
|
||||
"PARASITE",
|
||||
"WEIGHT",
|
||||
"HEIGHT",
|
||||
"BODY MASS INDEX (BMI)",
|
||||
"HEART RATE",
|
||||
"BLOOD PRESSURE SYSTOLIC",
|
||||
"BLOOD PRESSURE DIASTOLIC",
|
||||
"OXYGEN SATURATION (SpO2)",
|
||||
"BODY TEMPERATURE",
|
||||
"STEPS",
|
||||
"CALORIES BURNED"
|
||||
]
|
||||
},
|
||||
"value": {
|
||||
"type": "number",
|
||||
"description": "Observed/measured value"
|
||||
},
|
||||
"value_string": {
|
||||
"type": "string",
|
||||
"description": "Value as string if non-numeric (e.g., 'Negative', 'Trace', '> 65')"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "Unit of measurement"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"biomarkers"
|
||||
]
|
||||
}
|
||||
@@ -10,6 +10,7 @@ paths:
|
||||
database: "./data/zhealth.db"
|
||||
logs: "./logs"
|
||||
uploads: "./data/uploads"
|
||||
max_upload_mb: 50 # Maximum file upload size in MB
|
||||
|
||||
logging:
|
||||
level: "info" # Options: trace | debug | info | warn | error
|
||||
@@ -29,3 +30,12 @@ ai:
|
||||
provider: "gemini" # Options: gemini | openai | anthropic
|
||||
model: "gemini-3-flash-preview"
|
||||
api_key: "${AI_API_KEY}"
|
||||
|
||||
# Mistral OCR for document parsing
|
||||
mistral:
|
||||
api_key: "${MISTRAL_API_KEY}"
|
||||
ocr_model: "mistral-ocr-latest"
|
||||
max_pages_per_request: 8
|
||||
max_retries: 2 # Max retry attempts per chunk
|
||||
timeout_secs: 120 # Request timeout in seconds
|
||||
|
||||
|
||||
@@ -214,7 +214,7 @@ biomarkers:
|
||||
# ============================================================================
|
||||
# DIABETES / METABOLIC - Scale-based interpretations
|
||||
# ============================================================================
|
||||
- name: "HbA1c"
|
||||
- name: "GLYCOSYLATED HEMOGLOBIN (HbA1c)"
|
||||
test_category: DIABETES
|
||||
category: metabolic
|
||||
unit: "%"
|
||||
@@ -339,7 +339,7 @@ biomarkers:
|
||||
min: 36.0
|
||||
max: 44.0
|
||||
|
||||
- name: "TOTAL RBC"
|
||||
- name: "TOTAL RED BLOOD CELL COUNT (RBC)"
|
||||
test_category: HEMOGRAM
|
||||
category: blood
|
||||
unit: "10^6/µL"
|
||||
@@ -614,7 +614,7 @@ biomarkers:
|
||||
min: 0.13
|
||||
max: 1.19
|
||||
|
||||
- name: "VITAMIN B1/THIAMIN"
|
||||
- name: "VITAMIN B1 (THIAMIN)"
|
||||
test_category: VITAMIN
|
||||
category: vitamins
|
||||
unit: "ng/mL"
|
||||
@@ -623,7 +623,7 @@ biomarkers:
|
||||
min: 0.5
|
||||
max: 4.0
|
||||
|
||||
- name: "VITAMIN B2/RIBOFLAVIN"
|
||||
- name: "VITAMIN B2 (RIBOFLAVIN)"
|
||||
test_category: VITAMIN
|
||||
category: vitamins
|
||||
unit: "ng/mL"
|
||||
@@ -632,7 +632,7 @@ biomarkers:
|
||||
min: 1.6
|
||||
max: 68.2
|
||||
|
||||
- name: "VITAMIN B3/NICOTINIC ACID"
|
||||
- name: "VITAMIN B3 (NIACIN/NICOTINIC ACID)"
|
||||
test_category: VITAMIN
|
||||
category: vitamins
|
||||
unit: "ng/mL"
|
||||
@@ -640,7 +640,7 @@ biomarkers:
|
||||
reference:
|
||||
max: 5.0
|
||||
|
||||
- name: "VITAMIN B5/PANTOTHENIC"
|
||||
- name: "VITAMIN B5 (PANTOTHENIC ACID)"
|
||||
test_category: VITAMIN
|
||||
category: vitamins
|
||||
unit: "ng/mL"
|
||||
@@ -649,7 +649,7 @@ biomarkers:
|
||||
min: 11.0
|
||||
max: 150.0
|
||||
|
||||
- name: "VITAMIN B6/P5P"
|
||||
- name: "VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)"
|
||||
test_category: VITAMIN
|
||||
category: vitamins
|
||||
unit: "ng/mL"
|
||||
@@ -658,7 +658,7 @@ biomarkers:
|
||||
min: 5.0
|
||||
max: 50.0
|
||||
|
||||
- name: "VITAMIN B7/BIOTIN"
|
||||
- name: "VITAMIN B7 (BIOTIN)"
|
||||
test_category: VITAMIN
|
||||
category: vitamins
|
||||
unit: "ng/mL"
|
||||
@@ -667,7 +667,7 @@ biomarkers:
|
||||
min: 0.2
|
||||
max: 3.0
|
||||
|
||||
- name: "VITAMIN B9/FOLIC ACID"
|
||||
- name: "VITAMIN B9 (FOLIC ACID)"
|
||||
test_category: VITAMIN
|
||||
category: vitamins
|
||||
unit: "ng/mL"
|
||||
@@ -676,7 +676,7 @@ biomarkers:
|
||||
min: 0.2
|
||||
max: 20.0
|
||||
|
||||
- name: "VITAMIN B-12"
|
||||
- name: "VITAMIN B12 (COBALAMIN)"
|
||||
test_category: VITAMIN
|
||||
category: vitamins
|
||||
unit: "pg/mL"
|
||||
@@ -951,7 +951,7 @@ biomarkers:
|
||||
- { min: 4, max: 10, label: "Moderate risk of future heart attack" }
|
||||
- { min: 10, label: "Elevated risk of future heart attack" }
|
||||
|
||||
- name: "HS-CRP"
|
||||
- name: "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)"
|
||||
test_category: CARDIAC
|
||||
category: cardiac
|
||||
unit: "mg/L"
|
||||
@@ -970,7 +970,7 @@ biomarkers:
|
||||
reference:
|
||||
max: 30.0
|
||||
|
||||
- name: "LP-PLA2"
|
||||
- name: "LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)"
|
||||
test_category: CARDIAC
|
||||
category: cardiac
|
||||
unit: "nmol/min/mL"
|
||||
@@ -1062,7 +1062,7 @@ biomarkers:
|
||||
min: 2.6
|
||||
max: 6.0
|
||||
|
||||
- name: "eGFR"
|
||||
- name: "ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)"
|
||||
test_category: RENAL
|
||||
category: renal
|
||||
unit: "mL/min/1.73m²"
|
||||
@@ -1733,7 +1733,7 @@ biomarkers:
|
||||
category: body
|
||||
unit: "cm"
|
||||
|
||||
- name: "BMI"
|
||||
- name: "BODY MASS INDEX (BMI)"
|
||||
test_category: BODY
|
||||
category: body
|
||||
unit: "kg/m²"
|
||||
@@ -1773,7 +1773,7 @@ biomarkers:
|
||||
- { min: 80, max: 89, label: "High Blood Pressure Stage 1" }
|
||||
- { min: 90, label: "High Blood Pressure Stage 2" }
|
||||
|
||||
- name: "SPO2"
|
||||
- name: "OXYGEN SATURATION (SpO2)"
|
||||
test_category: VITALS
|
||||
category: vitals
|
||||
unit: "%"
|
||||
|
||||
@@ -12,6 +12,7 @@ pub struct Config {
|
||||
pub auth: AuthConfig,
|
||||
pub admin: AdminConfig,
|
||||
pub ai: AiConfig,
|
||||
pub mistral: MistralConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
@@ -20,11 +21,12 @@ pub struct ServerConfig {
|
||||
pub port: u16,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct PathsConfig {
|
||||
pub database: String,
|
||||
pub logs: String,
|
||||
pub uploads: String,
|
||||
pub max_upload_mb: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
@@ -53,6 +55,15 @@ pub struct AiConfig {
|
||||
pub api_key: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct MistralConfig {
|
||||
pub api_key: String,
|
||||
pub ocr_model: String,
|
||||
pub max_pages_per_request: u32,
|
||||
pub max_retries: u32,
|
||||
pub timeout_secs: u64,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Load configuration from a YAML file.
|
||||
pub fn load<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
|
||||
|
||||
@@ -5,5 +5,6 @@ pub mod biomarkers;
|
||||
pub mod categories;
|
||||
pub mod diets;
|
||||
pub mod entries;
|
||||
pub mod ocr;
|
||||
pub mod sources;
|
||||
pub mod users;
|
||||
|
||||
180
backend/src/handlers/ocr/matching.rs
Normal file
180
backend/src/handlers/ocr/matching.rs
Normal file
@@ -0,0 +1,180 @@
|
||||
//! Biomarker matching and merging logic.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use strsim::jaro_winkler;
|
||||
|
||||
use super::types::{Biomarker, DocumentAnnotation, OcrResult};
|
||||
|
||||
/// Fuzzy matching threshold (0.0 - 1.0).
|
||||
/// Names with Jaro-Winkler similarity >= this value are considered a match.
|
||||
const FUZZY_THRESHOLD: f64 = 0.90;
|
||||
|
||||
/// Find a matching biomarker name from the valid set.
|
||||
/// Returns the canonical name if found (exact, alias, or fuzzy match).
|
||||
///
|
||||
/// Matching order:
|
||||
/// 1. Exact match on full name
|
||||
/// 2. Extract parenthetical alias from INPUT (e.g., `(HS-CRP)` from `HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)`)
|
||||
/// 3. Extract parenthetical alias from SCHEMA (e.g., `HS-CRP` matches `HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)`)
|
||||
/// 4. Fuzzy match with Jaro-Winkler (threshold 0.90)
|
||||
fn find_matching_biomarker(name: &str, valid_biomarkers: &HashSet<String>) -> Option<String> {
|
||||
let name_upper = name.to_uppercase();
|
||||
|
||||
// 1. Exact match first (fast path)
|
||||
if valid_biomarkers.contains(&name_upper) {
|
||||
return Some(name_upper);
|
||||
}
|
||||
|
||||
// 2. Try extracting parenthetical alias from INPUT
|
||||
if let Some(alias) = extract_parenthetical_alias(&name_upper) {
|
||||
if valid_biomarkers.contains(&alias) {
|
||||
tracing::debug!(
|
||||
"Alias matched '{}' -> '{}' (extracted from parentheses in input)",
|
||||
name, alias
|
||||
);
|
||||
return Some(alias);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Try matching input against aliases in SCHEMA
|
||||
// This handles input "HS-CRP" matching schema "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)"
|
||||
for valid in valid_biomarkers {
|
||||
if let Some(alias) = extract_parenthetical_alias(valid) {
|
||||
if alias == name_upper {
|
||||
tracing::debug!(
|
||||
"Reverse alias matched '{}' -> '{}' (input is alias in schema)",
|
||||
name, valid
|
||||
);
|
||||
return Some(valid.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Fuzzy match with threshold
|
||||
valid_biomarkers.iter()
|
||||
.map(|valid| (valid, jaro_winkler(&name_upper, valid)))
|
||||
.filter(|(_, score)| *score >= FUZZY_THRESHOLD)
|
||||
.max_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
|
||||
.map(|(matched_name, score)| {
|
||||
tracing::debug!(
|
||||
"Fuzzy matched '{}' -> '{}' (score: {:.3})",
|
||||
name, matched_name, score
|
||||
);
|
||||
matched_name.clone()
|
||||
})
|
||||
}
|
||||
|
||||
/// Pull a short alias out of a trailing `(...)` group, upper-cased.
///
/// The name is trimmed first. An alias is returned only when the trimmed name ends with
/// `)`, and the text between the last `(` and that final `)` is 2 to 15 bytes long and
/// contains no space. Square brackets are not recognised.
///
/// Examples:
/// - "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)" -> Some("HS-CRP")
/// - "EST. GLOMERULAR FILTRATION RATE (eGFR)" -> Some("EGFR")
/// - "LIPOPROTEIN (A) [LP(A)]" -> None (does not end with `)`)
/// - "LIPOPROTEIN (A)" -> None (alias shorter than 2)
fn extract_parenthetical_alias(name: &str) -> Option<String> {
    let trimmed = name.trim();

    // Without a closing parenthesis at the very end there is no trailing alias.
    if !trimmed.ends_with(')') {
        return None;
    }

    let open = trimmed.rfind('(')?;
    // Both delimiters are single-byte ASCII, so these byte offsets are char boundaries.
    let inner = &trimmed[open + 1..trimmed.len() - 1];

    let plausible = (2..=15).contains(&inner.len()) && !inner.contains(' ');
    if plausible {
        Some(inner.to_uppercase())
    } else {
        None
    }
}
|
||||
|
||||
/// Merge multiple OCR results into one, filtering to only known biomarkers.
|
||||
/// Uses fuzzy matching to handle name variations.
|
||||
pub fn merge_results(results: Vec<DocumentAnnotation>, valid_biomarkers: &HashSet<String>) -> OcrResult {
|
||||
let mut merged = OcrResult {
|
||||
patient_name: None,
|
||||
patient_age: None,
|
||||
patient_gender: None,
|
||||
lab_name: None,
|
||||
test_date: None,
|
||||
biomarkers: Vec::new(),
|
||||
};
|
||||
|
||||
// Track biomarkers by canonical name, prefer ones with actual values
|
||||
let mut biomarker_map: HashMap<String, Biomarker> = HashMap::new();
|
||||
let mut skipped_count = 0;
|
||||
let mut fuzzy_matched_count = 0;
|
||||
|
||||
for result in results {
|
||||
// Take first non-null metadata
|
||||
if merged.patient_name.is_none() && result.patient_name.is_some() {
|
||||
merged.patient_name = result.patient_name;
|
||||
}
|
||||
if merged.patient_age.is_none() && result.patient_age.is_some() {
|
||||
merged.patient_age = result.patient_age;
|
||||
}
|
||||
if merged.patient_gender.is_none() && result.patient_gender.is_some() {
|
||||
merged.patient_gender = result.patient_gender;
|
||||
}
|
||||
if merged.lab_name.is_none() && result.lab_name.is_some() {
|
||||
merged.lab_name = result.lab_name;
|
||||
}
|
||||
if merged.test_date.is_none() && result.test_date.is_some() {
|
||||
merged.test_date = result.test_date;
|
||||
}
|
||||
|
||||
// Merge biomarkers with fuzzy matching
|
||||
if let Some(biomarkers) = result.biomarkers {
|
||||
for mut bm in biomarkers {
|
||||
let original_name = bm.name.clone();
|
||||
|
||||
// Try to find a matching canonical name
|
||||
let canonical_name = match find_matching_biomarker(&bm.name, valid_biomarkers) {
|
||||
Some(matched) => {
|
||||
if matched != bm.name.to_uppercase() {
|
||||
fuzzy_matched_count += 1;
|
||||
}
|
||||
// Update the biomarker name to canonical form
|
||||
bm.name = matched.clone();
|
||||
matched
|
||||
}
|
||||
None => {
|
||||
tracing::debug!("Skipping unknown biomarker: {}", original_name);
|
||||
skipped_count += 1;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let has_real_value = bm.value.is_some() ||
|
||||
bm.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false);
|
||||
|
||||
if let Some(existing) = biomarker_map.get(&canonical_name) {
|
||||
let existing_has_real_value = existing.value.is_some() ||
|
||||
existing.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false);
|
||||
|
||||
// Replace only if current has real value and existing doesn't
|
||||
if has_real_value && !existing_has_real_value {
|
||||
biomarker_map.insert(canonical_name, bm);
|
||||
}
|
||||
} else {
|
||||
biomarker_map.insert(canonical_name, bm);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if skipped_count > 0 {
|
||||
tracing::info!("Skipped {} unknown biomarkers not in schema", skipped_count);
|
||||
}
|
||||
if fuzzy_matched_count > 0 {
|
||||
tracing::info!("Fuzzy matched {} biomarkers to canonical names", fuzzy_matched_count);
|
||||
}
|
||||
|
||||
// Collect biomarkers from map, filtering out "Not Provided" only entries
|
||||
merged.biomarkers = biomarker_map.into_values()
|
||||
.filter(|bm| {
|
||||
bm.value.is_some() ||
|
||||
bm.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false)
|
||||
})
|
||||
.collect();
|
||||
|
||||
merged
|
||||
}
|
||||
211
backend/src/handlers/ocr/mistral.rs
Normal file
211
backend/src/handlers/ocr/mistral.rs
Normal file
@@ -0,0 +1,211 @@
|
||||
//! Mistral API integration for OCR.
|
||||
|
||||
use reqwest::multipart::{Form, Part};
|
||||
use serde_json::{json, Value};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use tokio::fs;
|
||||
|
||||
use crate::config::MistralConfig;
|
||||
use super::types::{Biomarker, DocumentAnnotation, MistralFileResponse, MistralOcrResponse};
|
||||
use super::schema::strip_descriptions;
|
||||
|
||||
/// Upload a file to Mistral and return the file ID.
|
||||
pub async fn upload_to_mistral(config: &MistralConfig, file_path: &PathBuf) -> Result<String, String> {
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(config.timeout_secs))
|
||||
.build()
|
||||
.map_err(|e| format!("Failed to create HTTP client: {}", e))?;
|
||||
|
||||
let file_bytes = fs::read(file_path)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to read file: {}", e))?;
|
||||
|
||||
let file_name = file_path
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or("document.pdf")
|
||||
.to_string();
|
||||
|
||||
let part = Part::bytes(file_bytes)
|
||||
.file_name(file_name)
|
||||
.mime_str("application/pdf")
|
||||
.map_err(|e| format!("MIME error: {}", e))?;
|
||||
|
||||
let form = Form::new()
|
||||
.text("purpose", "ocr")
|
||||
.part("file", part);
|
||||
|
||||
let response = client
|
||||
.post("https://api.mistral.ai/v1/files")
|
||||
.header("Authorization", format!("Bearer {}", config.api_key))
|
||||
.multipart(form)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("HTTP request failed: {}", e))?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
let error_text = response.text().await.unwrap_or_default();
|
||||
return Err(format!("Mistral upload failed: {}", error_text));
|
||||
}
|
||||
|
||||
let response_text = response.text().await
|
||||
.map_err(|e| format!("Failed to read response: {}", e))?;
|
||||
|
||||
tracing::info!("Mistral file upload response: {}", response_text);
|
||||
|
||||
let result: MistralFileResponse = serde_json::from_str(&response_text)
|
||||
.map_err(|e| format!("Failed to parse response: {} - raw: {}", e, response_text))?;
|
||||
|
||||
tracing::info!("Parsed file upload: id={}, num_pages={:?}", result.id, result.num_pages);
|
||||
|
||||
Ok(result.id)
|
||||
}
|
||||
|
||||
/// Process OCR for specific pages of an uploaded document.
|
||||
pub async fn ocr_pages(
|
||||
config: &MistralConfig,
|
||||
file_id: &str,
|
||||
pages: &[usize],
|
||||
) -> Result<DocumentAnnotation, String> {
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(config.timeout_secs))
|
||||
.build()
|
||||
.map_err(|e| format!("Failed to create HTTP client: {}", e))?;
|
||||
|
||||
// Load the complete schema from file
|
||||
let schema_content = std::fs::read_to_string("ocr_schema.json")
|
||||
.map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?;
|
||||
let mut schema: Value = serde_json::from_str(&schema_content)
|
||||
.map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?;
|
||||
|
||||
// Clean the schema - remove meta-fields that Mistral echoes back
|
||||
if let Some(obj) = schema.as_object_mut() {
|
||||
obj.remove("$schema");
|
||||
obj.remove("name");
|
||||
obj.remove("description");
|
||||
}
|
||||
strip_descriptions(&mut schema);
|
||||
|
||||
let body = json!({
|
||||
"model": config.ocr_model,
|
||||
"document": {
|
||||
"type": "file",
|
||||
"file_id": file_id
|
||||
},
|
||||
"pages": pages,
|
||||
"document_annotation_format": {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "LabReport",
|
||||
"schema": schema
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let response = client
|
||||
.post("https://api.mistral.ai/v1/ocr")
|
||||
.header("Authorization", format!("Bearer {}", config.api_key))
|
||||
.header("Content-Type", "application/json")
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("OCR request failed: {}", e))?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
let error_text = response.text().await.unwrap_or_default();
|
||||
return Err(format!("OCR failed: {}", error_text));
|
||||
}
|
||||
|
||||
let result: MistralOcrResponse = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| format!("Failed to parse OCR response: {}", e))?;
|
||||
|
||||
let annotation_str = result
|
||||
.document_annotation
|
||||
.ok_or_else(|| "No document annotation in response".to_string())?;
|
||||
|
||||
tracing::debug!("Raw annotation from Mistral: {}", &annotation_str);
|
||||
|
||||
// Mistral returns data wrapped in "properties" - extract it
|
||||
let raw_json: Value = serde_json::from_str(&annotation_str)
|
||||
.map_err(|e| format!("Failed to parse raw JSON: {}", e))?;
|
||||
|
||||
let data_json = if let Some(props) = raw_json.get("properties") {
|
||||
props.clone()
|
||||
} else {
|
||||
raw_json
|
||||
};
|
||||
|
||||
// Check if this is a schema-only response (no actual data)
|
||||
if let Some(biomarkers) = data_json.get("biomarkers") {
|
||||
if biomarkers.get("type").is_some() && biomarkers.get("items").is_some() {
|
||||
tracing::warn!("Skipping schema-only response (no data for these pages)");
|
||||
return Ok(DocumentAnnotation {
|
||||
patient_name: None,
|
||||
patient_age: None,
|
||||
patient_gender: None,
|
||||
lab_name: None,
|
||||
test_date: None,
|
||||
biomarkers: Some(vec![]),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let annotation = parse_annotation(&data_json)?;
|
||||
|
||||
tracing::info!("Parsed annotation: patient={:?}, biomarkers={}",
|
||||
annotation.patient_name,
|
||||
annotation.biomarkers.as_ref().map(|b| b.len()).unwrap_or(0));
|
||||
|
||||
Ok(annotation)
|
||||
}
|
||||
|
||||
/// Parse annotation handling various Mistral response formats.
|
||||
fn parse_annotation(data: &Value) -> Result<DocumentAnnotation, String> {
|
||||
let patient_name = data.get("patient_name").and_then(|v| v.as_str()).map(|s| s.to_string());
|
||||
let patient_age = data.get("patient_age").and_then(|v| v.as_i64()).map(|n| n as i32);
|
||||
let patient_gender = data.get("patient_gender").and_then(|v| v.as_str()).map(|s| s.to_string());
|
||||
let lab_name = data.get("lab_name").and_then(|v| v.as_str()).map(|s| s.to_string());
|
||||
let test_date = data.get("test_date").and_then(|v| v.as_str()).map(|s| s.to_string());
|
||||
|
||||
// Parse biomarkers - handle nested "properties" format
|
||||
let biomarkers = if let Some(bm_array) = data.get("biomarkers").and_then(|v| v.as_array()) {
|
||||
let mut parsed: Vec<Biomarker> = vec![];
|
||||
for item in bm_array {
|
||||
// Try direct format first
|
||||
if let Some(name) = item.get("name").and_then(|v| v.as_str()) {
|
||||
parsed.push(Biomarker {
|
||||
name: name.to_string(),
|
||||
value: item.get("value").and_then(|v| v.as_f64()),
|
||||
value_string: item.get("value_string").and_then(|v| v.as_str()).map(|s| s.to_string()),
|
||||
unit: item.get("unit").and_then(|v| v.as_str()).map(|s| s.to_string()),
|
||||
});
|
||||
}
|
||||
// Try nested "properties" format
|
||||
else if let Some(props) = item.get("properties") {
|
||||
if let Some(name) = props.get("name").and_then(|v| v.as_str()) {
|
||||
parsed.push(Biomarker {
|
||||
name: name.to_string(),
|
||||
value: props.get("value").and_then(|v| v.as_f64()),
|
||||
value_string: props.get("value_string").and_then(|v| v.as_str()).map(|s| s.to_string()),
|
||||
unit: props.get("unit").and_then(|v| v.as_str()).map(|s| s.to_string()),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(parsed)
|
||||
} else {
|
||||
Some(vec![])
|
||||
};
|
||||
|
||||
Ok(DocumentAnnotation {
|
||||
patient_name,
|
||||
patient_age,
|
||||
patient_gender,
|
||||
lab_name,
|
||||
test_date,
|
||||
biomarkers,
|
||||
})
|
||||
}
|
||||
200
backend/src/handlers/ocr/mod.rs
Normal file
200
backend/src/handlers/ocr/mod.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
//! OCR API handlers - Mistral OCR integration for document parsing.
|
||||
|
||||
mod matching;
|
||||
mod mistral;
|
||||
mod schema;
|
||||
mod types;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use axum::{
|
||||
extract::{Path, State},
|
||||
http::StatusCode,
|
||||
Json,
|
||||
};
|
||||
use sea_orm::{ActiveModelTrait, EntityTrait, Set};
|
||||
|
||||
use crate::models::bio::source;
|
||||
|
||||
// Re-export public types
|
||||
pub use types::{ErrorResponse, OcrState, ParseResponse};
|
||||
|
||||
/// Get page count from a local file.
|
||||
/// For PDFs, uses lopdf to read the actual page count.
|
||||
/// For other file types (images, etc.), returns 1.
|
||||
fn get_page_count(file_path: &PathBuf) -> usize {
|
||||
let extension = file_path.extension()
|
||||
.and_then(|e| e.to_str())
|
||||
.unwrap_or("")
|
||||
.to_lowercase();
|
||||
|
||||
if extension == "pdf" {
|
||||
match lopdf::Document::load(file_path) {
|
||||
Ok(doc) => {
|
||||
let count = doc.get_pages().len();
|
||||
tracing::info!("PDF page count (local): {}", count);
|
||||
count
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to read PDF page count: {}, defaulting to 1", e);
|
||||
1
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tracing::info!("Non-PDF file, treating as 1 page");
|
||||
1
|
||||
}
|
||||
}
|
||||
|
||||
/// POST /api/sources/:id/parse - Parse a source document using Mistral OCR.
|
||||
pub async fn parse_source(
|
||||
State(state): State<OcrState>,
|
||||
Path(id): Path<i32>,
|
||||
) -> Result<Json<ParseResponse>, (StatusCode, Json<ErrorResponse>)> {
|
||||
// 1. Get source from database
|
||||
let source_entity = source::Entity::find_by_id(id)
|
||||
.one(&state.db)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("Database error: {}", e),
|
||||
}),
|
||||
)
|
||||
})?
|
||||
.ok_or_else(|| {
|
||||
(
|
||||
StatusCode::NOT_FOUND,
|
||||
Json(ErrorResponse {
|
||||
error: "Source not found".to_string(),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
|
||||
let file_path = PathBuf::from(&source_entity.file_path);
|
||||
|
||||
// 2. Upload file to Mistral
|
||||
let file_id = mistral::upload_to_mistral(&state.mistral, &file_path).await.map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("Mistral upload failed: {}", e),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
|
||||
// 3. Get page count locally from PDF
|
||||
let max_pages = get_page_count(&file_path);
|
||||
let chunk_size = state.mistral.max_pages_per_request as usize;
|
||||
let max_retries = state.mistral.max_retries;
|
||||
let mut all_results: Vec<types::DocumentAnnotation> = Vec::new();
|
||||
let mut failed_chunk: Option<String> = None;
|
||||
|
||||
for start_page in (0..max_pages).step_by(chunk_size) {
|
||||
let pages: Vec<usize> = (start_page..std::cmp::min(start_page + chunk_size, max_pages)).collect();
|
||||
|
||||
tracing::info!("Processing OCR for pages {:?}", pages);
|
||||
|
||||
// Retry loop for this chunk
|
||||
let mut attempts = 0;
|
||||
let mut chunk_result = None;
|
||||
|
||||
while attempts <= max_retries {
|
||||
match mistral::ocr_pages(&state.mistral, &file_id, &pages).await {
|
||||
Ok(annotation) => {
|
||||
chunk_result = Some(annotation);
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
if e.contains("out of range") || e.contains("no pages") || e.contains("Invalid page") {
|
||||
tracing::info!("Reached end of document at pages {:?}", pages);
|
||||
break;
|
||||
}
|
||||
|
||||
attempts += 1;
|
||||
if attempts <= max_retries {
|
||||
tracing::warn!("OCR chunk error (pages {:?}), attempt {}/{}: {}", pages, attempts, max_retries + 1, e);
|
||||
} else {
|
||||
tracing::error!("OCR chunk failed after {} attempts (pages {:?}): {}", max_retries + 1, pages, e);
|
||||
failed_chunk = Some(format!("Pages {:?}: {}", pages, e));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(annotation) = chunk_result {
|
||||
all_results.push(annotation);
|
||||
} else if failed_chunk.is_some() {
|
||||
break;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Fail if any chunk failed
|
||||
if let Some(error_msg) = failed_chunk {
|
||||
return Err((
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("OCR parsing failed: {}", error_msg),
|
||||
}),
|
||||
));
|
||||
}
|
||||
|
||||
if all_results.is_empty() {
|
||||
return Err((
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: "No OCR results obtained".to_string(),
|
||||
}),
|
||||
));
|
||||
}
|
||||
|
||||
// 4. Get valid biomarker names from schema
|
||||
let valid_biomarkers = schema::extract_valid_biomarker_names().map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("Failed to read schema: {}", e),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
|
||||
tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len());
|
||||
|
||||
// 5. Merge results with fuzzy matching
|
||||
let merged = matching::merge_results(all_results, &valid_biomarkers);
|
||||
|
||||
// 6. Save to database
|
||||
let ocr_json = serde_json::to_string(&merged).map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("JSON serialization failed: {}", e),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut active_model: source::ActiveModel = source_entity.into();
|
||||
active_model.ocr_data = Set(Some(ocr_json));
|
||||
|
||||
active_model.update(&state.db).await.map_err(|e| {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(ErrorResponse {
|
||||
error: format!("Database update failed: {}", e),
|
||||
}),
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(Json(ParseResponse {
|
||||
success: true,
|
||||
biomarkers_count: merged.biomarkers.len(),
|
||||
message: format!(
|
||||
"Successfully parsed {} biomarkers for {}",
|
||||
merged.biomarkers.len(),
|
||||
merged.patient_name.unwrap_or_else(|| "Unknown".to_string())
|
||||
),
|
||||
}))
|
||||
}
|
||||
49
backend/src/handlers/ocr/schema.rs
Normal file
49
backend/src/handlers/ocr/schema.rs
Normal file
@@ -0,0 +1,49 @@
|
||||
//! Schema handling utilities.
|
||||
|
||||
use serde_json::Value;
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Extract valid biomarker names from the ocr_schema.json enum.
|
||||
pub fn extract_valid_biomarker_names() -> Result<HashSet<String>, String> {
|
||||
let schema_content = std::fs::read_to_string("ocr_schema.json")
|
||||
.map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?;
|
||||
let schema: Value = serde_json::from_str(&schema_content)
|
||||
.map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?;
|
||||
|
||||
// Navigate to: properties.biomarkers.items.properties.name.enum
|
||||
let names = schema
|
||||
.get("properties")
|
||||
.and_then(|p| p.get("biomarkers"))
|
||||
.and_then(|b| b.get("items"))
|
||||
.and_then(|i| i.get("properties"))
|
||||
.and_then(|p| p.get("name"))
|
||||
.and_then(|n| n.get("enum"))
|
||||
.and_then(|e| e.as_array())
|
||||
.ok_or_else(|| "Could not find biomarker name enum in schema".to_string())?;
|
||||
|
||||
let valid_names: HashSet<String> = names
|
||||
.iter()
|
||||
.filter_map(|v| v.as_str())
|
||||
.map(|s| s.to_uppercase())
|
||||
.collect();
|
||||
|
||||
Ok(valid_names)
|
||||
}
|
||||
|
||||
/// Recursively remove "description" fields from a JSON value.
|
||||
pub fn strip_descriptions(value: &mut Value) {
|
||||
match value {
|
||||
Value::Object(map) => {
|
||||
map.remove("description");
|
||||
for (_, v) in map.iter_mut() {
|
||||
strip_descriptions(v);
|
||||
}
|
||||
}
|
||||
Value::Array(arr) => {
|
||||
for v in arr.iter_mut() {
|
||||
strip_descriptions(v);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
77
backend/src/handlers/ocr/types.rs
Normal file
77
backend/src/handlers/ocr/types.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
//! Type definitions for OCR module.
|
||||
|
||||
use sea_orm::DatabaseConnection;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::config::MistralConfig;
|
||||
|
||||
/// State for OCR handlers.
|
||||
#[derive(Clone)]
|
||||
pub struct OcrState {
|
||||
pub db: DatabaseConnection,
|
||||
pub uploads_path: PathBuf,
|
||||
pub mistral: MistralConfig,
|
||||
}
|
||||
|
||||
/// Response for parse endpoint.
|
||||
#[derive(Serialize)]
|
||||
pub struct ParseResponse {
|
||||
pub success: bool,
|
||||
pub biomarkers_count: usize,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
/// Error response.
|
||||
#[derive(Serialize)]
|
||||
pub struct ErrorResponse {
|
||||
pub error: String,
|
||||
}
|
||||
|
||||
/// Mistral file upload response.
|
||||
#[derive(Deserialize)]
|
||||
pub struct MistralFileResponse {
|
||||
pub id: String,
|
||||
#[allow(dead_code)]
|
||||
pub bytes: i64,
|
||||
pub num_pages: Option<usize>,
|
||||
}
|
||||
|
||||
/// Mistral OCR response.
|
||||
#[derive(Deserialize)]
|
||||
pub struct MistralOcrResponse {
|
||||
pub document_annotation: Option<String>,
|
||||
#[allow(dead_code)]
|
||||
pub pages: Option<Vec<serde_json::Value>>,
|
||||
}
|
||||
|
||||
/// Extracted biomarker from OCR.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Biomarker {
|
||||
pub name: String,
|
||||
pub value: Option<f64>,
|
||||
pub value_string: Option<String>,
|
||||
pub unit: Option<String>,
|
||||
}
|
||||
|
||||
/// Merged OCR result.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct OcrResult {
|
||||
pub patient_name: Option<String>,
|
||||
pub patient_age: Option<i32>,
|
||||
pub patient_gender: Option<String>,
|
||||
pub lab_name: Option<String>,
|
||||
pub test_date: Option<String>,
|
||||
pub biomarkers: Vec<Biomarker>,
|
||||
}
|
||||
|
||||
/// Document annotation from Mistral.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct DocumentAnnotation {
|
||||
pub patient_name: Option<String>,
|
||||
pub patient_age: Option<i32>,
|
||||
pub patient_gender: Option<String>,
|
||||
pub lab_name: Option<String>,
|
||||
pub test_date: Option<String>,
|
||||
pub biomarkers: Option<Vec<Biomarker>>,
|
||||
}
|
||||
@@ -156,13 +156,26 @@ fn create_router(db: DatabaseConnection, config: &config::Config) -> Router {
|
||||
.route("/api/sources/{id}", get(handlers::sources::get_source)
|
||||
.delete(handlers::sources::delete_source))
|
||||
.route("/api/sources/{id}/ocr", put(handlers::sources::update_ocr))
|
||||
.layer(axum::extract::DefaultBodyLimit::max(config.paths.max_upload_mb as usize * 1024 * 1024))
|
||||
.route_layer(middleware::from_fn(require_auth))
|
||||
.with_state(sources_state);
|
||||
|
||||
// OCR routes (need Mistral config)
|
||||
let ocr_state = handlers::ocr::OcrState {
|
||||
db: db.clone(),
|
||||
uploads_path: PathBuf::from(&config.paths.uploads),
|
||||
mistral: config.mistral.clone(),
|
||||
};
|
||||
let ocr_routes = Router::new()
|
||||
.route("/api/sources/{id}/parse", post(handlers::ocr::parse_source))
|
||||
.route_layer(middleware::from_fn(require_auth))
|
||||
.with_state(ocr_state);
|
||||
|
||||
Router::new()
|
||||
.merge(public_routes)
|
||||
.merge(protected_routes)
|
||||
.merge(sources_routes)
|
||||
.merge(ocr_routes)
|
||||
.layer(auth_layer)
|
||||
.with_state(db)
|
||||
}
|
||||
@@ -185,10 +198,18 @@ async fn require_auth(
|
||||
}
|
||||
|
||||
fn init_logging(config: &config::Config) {
|
||||
let log_level = config.logging.level.parse().unwrap_or(tracing::Level::INFO);
|
||||
// Build filter: use configured level for our code, but restrict sqlx/sea_orm
|
||||
let filter_str = format!(
|
||||
"{},sqlx=warn,sea_orm=warn",
|
||||
config.logging.level
|
||||
);
|
||||
|
||||
let filter = tracing_subscriber::filter::EnvFilter::try_new(&filter_str)
|
||||
.unwrap_or_else(|_| tracing_subscriber::filter::EnvFilter::new("info,sqlx=warn,sea_orm=warn"));
|
||||
|
||||
tracing_subscriber::registry()
|
||||
.with(tracing_subscriber::fmt::layer())
|
||||
.with(tracing_subscriber::filter::LevelFilter::from_level(log_level))
|
||||
.with(filter)
|
||||
.init();
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ export function SourcesPage() {
|
||||
const [error, setError] = useState<string | null>(null)
|
||||
const [dragOver, setDragOver] = useState(false)
|
||||
const [deleteConfirmId, setDeleteConfirmId] = useState<number | null>(null)
|
||||
const [parsingId, setParsingId] = useState<number | null>(null)
|
||||
const fileInputRef = useRef<HTMLInputElement>(null)
|
||||
|
||||
// Fetch sources on mount
|
||||
@@ -98,6 +99,31 @@ export function SourcesPage() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Ask the backend to OCR-parse one source and refresh the list on success.
 *
 * While the request is in flight `parsingId` holds the source id (used to
 * disable its button); it is always reset afterwards. On an HTTP error the
 * server's `error` string is shown when the body is JSON and carries one,
 * otherwise a generic message including the HTTP status.
 *
 * @param id - Id of the source to parse.
 */
const handleParse = async (id: number) => {
  setParsingId(id)
  setError(null)
  try {
    const res = await fetch(`/api/sources/${id}/parse`, {
      method: 'POST',
      credentials: 'include',
    })

    if (res.ok) {
      // Refresh sources to show updated status. The response body is not
      // needed here and is deliberately not logged: its message includes the
      // patient name.
      fetchSources()
      return
    }

    // Error bodies are not guaranteed to be JSON (e.g. a proxy error page),
    // so a failed decode must not mask the HTTP failure itself.
    const body = (await res.json().catch(() => null)) as { error?: unknown } | null
    const serverMessage = typeof body?.error === 'string' ? body.error : ''
    setError(serverMessage || `Parse failed (HTTP ${res.status})`)
  } catch (e) {
    console.error('Failed to parse:', e)
    setError('Failed to parse document')
  } finally {
    setParsingId(null)
  }
}
|
||||
|
||||
const formatFileSize = (bytes: number) => {
|
||||
if (bytes < 1024) return `${bytes} B`
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`
|
||||
@@ -196,7 +222,13 @@ export function SourcesPage() {
|
||||
<img src="/icons/general/icons8-checkmark-50.png" alt="Parsed" className="icon-sm" /> Parsed
|
||||
</span>
|
||||
) : (
|
||||
<span className="text-secondary text-xs">Pending</span>
|
||||
<button
|
||||
className="btn btn-primary btn-sm"
|
||||
onClick={() => handleParse(source.id)}
|
||||
disabled={parsingId === source.id}
|
||||
>
|
||||
{parsingId === source.id ? 'Parsing...' : 'Parse'}
|
||||
</button>
|
||||
)}
|
||||
<button
|
||||
className="btn btn-danger btn-sm"
|
||||
|
||||
Reference in New Issue
Block a user