feat: implement Mistral OCR document parsing with fuzzy matching and frontend integration
This commit is contained in:
@@ -47,3 +47,9 @@ regex = "1"
|
|||||||
|
|
||||||
# CLI
|
# CLI
|
||||||
argh = "0.1"
|
argh = "0.1"
|
||||||
|
reqwest = { version = "0.12.26", features = ["multipart", "json"] }
|
||||||
|
serde_json = "1.0.145"
|
||||||
|
|
||||||
|
# PDF parsing for page count
|
||||||
|
lopdf = "0.36"
|
||||||
|
strsim = "0.11"
|
||||||
|
|||||||
245
backend/ocr_schema.json
Normal file
245
backend/ocr_schema.json
Normal file
@@ -0,0 +1,245 @@
|
|||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||||
|
"name": "LabReport",
|
||||||
|
"description": "Extract biomarker data from a medical lab report",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"patient_name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Full name of the patient"
|
||||||
|
},
|
||||||
|
"patient_age": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Age of the patient in years"
|
||||||
|
},
|
||||||
|
"patient_gender": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"male",
|
||||||
|
"female",
|
||||||
|
"other"
|
||||||
|
],
|
||||||
|
"description": "Gender of the patient"
|
||||||
|
},
|
||||||
|
"lab_name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Name of the laboratory"
|
||||||
|
},
|
||||||
|
"test_date": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Date when the sample was collected (YYYY-MM-DD format if possible)"
|
||||||
|
},
|
||||||
|
"report_id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Report ID, barcode, or reference number"
|
||||||
|
},
|
||||||
|
"biomarkers": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "List of biomarker test results",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Name of the biomarker/test",
|
||||||
|
"enum": [
|
||||||
|
"ARSENIC",
|
||||||
|
"CADMIUM",
|
||||||
|
"MERCURY",
|
||||||
|
"LEAD",
|
||||||
|
"CHROMIUM",
|
||||||
|
"BARIUM",
|
||||||
|
"COBALT",
|
||||||
|
"CAESIUM",
|
||||||
|
"THALLIUM",
|
||||||
|
"URANIUM",
|
||||||
|
"STRONTIUM",
|
||||||
|
"ANTIMONY",
|
||||||
|
"TIN",
|
||||||
|
"MOLYBDENUM",
|
||||||
|
"SILVER",
|
||||||
|
"VANADIUM",
|
||||||
|
"BERYLLIUM",
|
||||||
|
"BISMUTH",
|
||||||
|
"SELENIUM",
|
||||||
|
"ALUMINIUM",
|
||||||
|
"NICKEL",
|
||||||
|
"MANGANESE",
|
||||||
|
"GLYCOSYLATED HEMOGLOBIN (HbA1c)",
|
||||||
|
"AVERAGE BLOOD GLUCOSE (ABG)",
|
||||||
|
"FASTING BLOOD SUGAR (GLUCOSE)",
|
||||||
|
"INSULIN FASTING",
|
||||||
|
"FRUCTOSAMINE",
|
||||||
|
"BLOOD KETONE (D3HB)",
|
||||||
|
"ERYTHROCYTE SEDIMENTATION RATE (ESR)",
|
||||||
|
"ANTI CCP (ACCP)",
|
||||||
|
"ANTI NUCLEAR ANTIBODIES (ANA)",
|
||||||
|
"HEMOGLOBIN",
|
||||||
|
"HEMATOCRIT (PCV)",
|
||||||
|
"TOTAL RED BLOOD CELL COUNT (RBC)",
|
||||||
|
"MEAN CORPUSCULAR VOLUME (MCV)",
|
||||||
|
"MEAN CORPUSCULAR HEMOGLOBIN (MCH)",
|
||||||
|
"MEAN CORP. HEMO. CONC (MCHC)",
|
||||||
|
"RED CELL DISTRIBUTION WIDTH - SD (RDW-SD)",
|
||||||
|
"RED CELL DISTRIBUTION WIDTH (RDW-CV)",
|
||||||
|
"TOTAL LEUCOCYTE COUNT (WBC)",
|
||||||
|
"NEUTROPHILS PERCENTAGE",
|
||||||
|
"LYMPHOCYTES PERCENTAGE",
|
||||||
|
"MONOCYTES PERCENTAGE",
|
||||||
|
"EOSINOPHILS PERCENTAGE",
|
||||||
|
"BASOPHILS PERCENTAGE",
|
||||||
|
"IMMATURE GRANULOCYTE PERCENTAGE (IG%)",
|
||||||
|
"NUCLEATED RED BLOOD CELLS %",
|
||||||
|
"NEUTROPHILS ABSOLUTE COUNT",
|
||||||
|
"LYMPHOCYTES ABSOLUTE COUNT",
|
||||||
|
"MONOCYTES - ABSOLUTE COUNT",
|
||||||
|
"BASOPHILS ABSOLUTE COUNT",
|
||||||
|
"EOSINOPHILS ABSOLUTE COUNT",
|
||||||
|
"IMMATURE GRANULOCYTES (IG)",
|
||||||
|
"NUCLEATED RED BLOOD CELLS",
|
||||||
|
"PLATELET COUNT",
|
||||||
|
"MEAN PLATELET VOLUME (MPV)",
|
||||||
|
"PLATELET DISTRIBUTION WIDTH (PDW)",
|
||||||
|
"PLATELET TO LARGE CELL RATIO (PLCR)",
|
||||||
|
"PLATELETCRIT (PCT)",
|
||||||
|
"VITAMIN A",
|
||||||
|
"VITAMIN E",
|
||||||
|
"VITAMIN K",
|
||||||
|
"VITAMIN B1 (THIAMIN)",
|
||||||
|
"VITAMIN B2 (RIBOFLAVIN)",
|
||||||
|
"VITAMIN B3 (NIACIN/NICOTINIC ACID)",
|
||||||
|
"VITAMIN B5 (PANTOTHENIC ACID)",
|
||||||
|
"VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)",
|
||||||
|
"VITAMIN B7 (BIOTIN)",
|
||||||
|
"VITAMIN B9 (FOLIC ACID)",
|
||||||
|
"VITAMIN B12 (COBALAMIN)",
|
||||||
|
"VITAMIN D TOTAL",
|
||||||
|
"VITAMIN D2",
|
||||||
|
"VITAMIN D3",
|
||||||
|
"CORTISOL",
|
||||||
|
"CORTICOSTERONE",
|
||||||
|
"ANDROSTENEDIONE",
|
||||||
|
"ESTRADIOL",
|
||||||
|
"TESTOSTERONE",
|
||||||
|
"PROGESTERONE",
|
||||||
|
"17-HYDROXYPROGESTERONE",
|
||||||
|
"DEHYDROEPIANDROSTERONE (DHEA)",
|
||||||
|
"DHEA - SULPHATE (DHEAS)",
|
||||||
|
"DEOXYCORTISOL",
|
||||||
|
"ALPHA-1-ANTITRYPSIN (AAT)",
|
||||||
|
"HOMOCYSTEINE",
|
||||||
|
"TROPONIN I",
|
||||||
|
"HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)",
|
||||||
|
"LIPOPROTEIN (A) [Lp(a)]",
|
||||||
|
"LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)",
|
||||||
|
"CYSTATIN C",
|
||||||
|
"BLOOD UREA NITROGEN (BUN)",
|
||||||
|
"UREA (CALCULATED)",
|
||||||
|
"CREATININE - SERUM",
|
||||||
|
"UREA / SR.CREATININE RATIO",
|
||||||
|
"BUN / SR.CREATININE RATIO",
|
||||||
|
"CALCIUM",
|
||||||
|
"URIC ACID",
|
||||||
|
"ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)",
|
||||||
|
"TOTAL CHOLESTEROL",
|
||||||
|
"HDL CHOLESTEROL - DIRECT",
|
||||||
|
"LDL CHOLESTEROL - DIRECT",
|
||||||
|
"TRIGLYCERIDES",
|
||||||
|
"VLDL CHOLESTEROL",
|
||||||
|
"NON-HDL CHOLESTEROL",
|
||||||
|
"TC / HDL CHOLESTEROL RATIO",
|
||||||
|
"LDL / HDL RATIO",
|
||||||
|
"HDL / LDL RATIO",
|
||||||
|
"TRIG / HDL RATIO",
|
||||||
|
"APOLIPOPROTEIN - A1 (APO-A1)",
|
||||||
|
"APOLIPOPROTEIN - B (APO-B)",
|
||||||
|
"APO B / APO A1 RATIO",
|
||||||
|
"IRON",
|
||||||
|
"TOTAL IRON BINDING CAPACITY (TIBC)",
|
||||||
|
"% TRANSFERRIN SATURATION",
|
||||||
|
"FERRITIN",
|
||||||
|
"UNSAT. IRON-BINDING CAPACITY (UIBC)",
|
||||||
|
"ALKALINE PHOSPHATASE",
|
||||||
|
"BILIRUBIN - TOTAL",
|
||||||
|
"BILIRUBIN - DIRECT",
|
||||||
|
"BILIRUBIN (INDIRECT)",
|
||||||
|
"GAMMA GLUTAMYL TRANSFERASE (GGT)",
|
||||||
|
"ASPARTATE AMINOTRANSFERASE (SGOT)",
|
||||||
|
"ALANINE TRANSAMINASE (SGPT)",
|
||||||
|
"SGOT / SGPT RATIO",
|
||||||
|
"PROTEIN - TOTAL",
|
||||||
|
"ALBUMIN - SERUM",
|
||||||
|
"SERUM GLOBULIN",
|
||||||
|
"SERUM ALB/GLOBULIN RATIO",
|
||||||
|
"SODIUM",
|
||||||
|
"POTASSIUM",
|
||||||
|
"CHLORIDE",
|
||||||
|
"MAGNESIUM",
|
||||||
|
"TOTAL TRIIODOTHYRONINE (T3)",
|
||||||
|
"TOTAL THYROXINE (T4)",
|
||||||
|
"TSH ULTRASENSITIVE",
|
||||||
|
"SERUM COPPER",
|
||||||
|
"SERUM ZINC",
|
||||||
|
"AMYLASE",
|
||||||
|
"LIPASE",
|
||||||
|
"URINARY MICROALBUMIN",
|
||||||
|
"CREATININE - URINE",
|
||||||
|
"URI. ALBUMIN/CREATININE RATIO",
|
||||||
|
"URINE COLOUR",
|
||||||
|
"URINE APPEARANCE",
|
||||||
|
"URINE SPECIFIC GRAVITY",
|
||||||
|
"URINE PH",
|
||||||
|
"URINARY PROTEIN",
|
||||||
|
"URINARY GLUCOSE",
|
||||||
|
"URINE KETONE",
|
||||||
|
"URINARY BILIRUBIN",
|
||||||
|
"UROBILINOGEN",
|
||||||
|
"BILE SALT",
|
||||||
|
"BILE PIGMENT",
|
||||||
|
"URINE BLOOD",
|
||||||
|
"NITRITE",
|
||||||
|
"LEUCOCYTE ESTERASE",
|
||||||
|
"MUCUS",
|
||||||
|
"URINE RBC",
|
||||||
|
"URINARY LEUCOCYTES (PUS CELLS)",
|
||||||
|
"EPITHELIAL CELLS",
|
||||||
|
"CASTS",
|
||||||
|
"CRYSTALS",
|
||||||
|
"BACTERIA",
|
||||||
|
"YEAST",
|
||||||
|
"PARASITE",
|
||||||
|
"WEIGHT",
|
||||||
|
"HEIGHT",
|
||||||
|
"BODY MASS INDEX (BMI)",
|
||||||
|
"HEART RATE",
|
||||||
|
"BLOOD PRESSURE SYSTOLIC",
|
||||||
|
"BLOOD PRESSURE DIASTOLIC",
|
||||||
|
"OXYGEN SATURATION (SpO2)",
|
||||||
|
"BODY TEMPERATURE",
|
||||||
|
"STEPS",
|
||||||
|
"CALORIES BURNED"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"value": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "Observed/measured value"
|
||||||
|
},
|
||||||
|
"value_string": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Value as string if non-numeric (e.g., 'Negative', 'Trace', '> 65')"
|
||||||
|
},
|
||||||
|
"unit": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Unit of measurement"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"name"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"biomarkers"
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -10,6 +10,7 @@ paths:
|
|||||||
database: "./data/zhealth.db"
|
database: "./data/zhealth.db"
|
||||||
logs: "./logs"
|
logs: "./logs"
|
||||||
uploads: "./data/uploads"
|
uploads: "./data/uploads"
|
||||||
|
max_upload_mb: 50 # Maximum file upload size in MB
|
||||||
|
|
||||||
logging:
|
logging:
|
||||||
level: "info" # Options: trace | debug | info | warn | error
|
level: "info" # Options: trace | debug | info | warn | error
|
||||||
@@ -29,3 +30,12 @@ ai:
|
|||||||
provider: "gemini" # Options: gemini | openai | anthropic
|
provider: "gemini" # Options: gemini | openai | anthropic
|
||||||
model: "gemini-3-flash-preview"
|
model: "gemini-3-flash-preview"
|
||||||
api_key: "${AI_API_KEY}"
|
api_key: "${AI_API_KEY}"
|
||||||
|
|
||||||
|
# Mistral OCR for document parsing
|
||||||
|
mistral:
|
||||||
|
api_key: "${MISTRAL_API_KEY}"
|
||||||
|
ocr_model: "mistral-ocr-latest"
|
||||||
|
max_pages_per_request: 8
|
||||||
|
max_retries: 2 # Max retry attempts per chunk
|
||||||
|
timeout_secs: 120 # Request timeout in seconds
|
||||||
|
|
||||||
|
|||||||
@@ -214,7 +214,7 @@ biomarkers:
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
# DIABETES / METABOLIC - Scale-based interpretations
|
# DIABETES / METABOLIC - Scale-based interpretations
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
- name: "HbA1c"
|
- name: "GLYCOSYLATED HEMOGLOBIN (HbA1c)"
|
||||||
test_category: DIABETES
|
test_category: DIABETES
|
||||||
category: metabolic
|
category: metabolic
|
||||||
unit: "%"
|
unit: "%"
|
||||||
@@ -339,7 +339,7 @@ biomarkers:
|
|||||||
min: 36.0
|
min: 36.0
|
||||||
max: 44.0
|
max: 44.0
|
||||||
|
|
||||||
- name: "TOTAL RBC"
|
- name: "TOTAL RED BLOOD CELL COUNT (RBC)"
|
||||||
test_category: HEMOGRAM
|
test_category: HEMOGRAM
|
||||||
category: blood
|
category: blood
|
||||||
unit: "10^6/µL"
|
unit: "10^6/µL"
|
||||||
@@ -614,7 +614,7 @@ biomarkers:
|
|||||||
min: 0.13
|
min: 0.13
|
||||||
max: 1.19
|
max: 1.19
|
||||||
|
|
||||||
- name: "VITAMIN B1/THIAMIN"
|
- name: "VITAMIN B1 (THIAMIN)"
|
||||||
test_category: VITAMIN
|
test_category: VITAMIN
|
||||||
category: vitamins
|
category: vitamins
|
||||||
unit: "ng/mL"
|
unit: "ng/mL"
|
||||||
@@ -623,7 +623,7 @@ biomarkers:
|
|||||||
min: 0.5
|
min: 0.5
|
||||||
max: 4.0
|
max: 4.0
|
||||||
|
|
||||||
- name: "VITAMIN B2/RIBOFLAVIN"
|
- name: "VITAMIN B2 (RIBOFLAVIN)"
|
||||||
test_category: VITAMIN
|
test_category: VITAMIN
|
||||||
category: vitamins
|
category: vitamins
|
||||||
unit: "ng/mL"
|
unit: "ng/mL"
|
||||||
@@ -632,7 +632,7 @@ biomarkers:
|
|||||||
min: 1.6
|
min: 1.6
|
||||||
max: 68.2
|
max: 68.2
|
||||||
|
|
||||||
- name: "VITAMIN B3/NICOTINIC ACID"
|
- name: "VITAMIN B3 (NIACIN/NICOTINIC ACID)"
|
||||||
test_category: VITAMIN
|
test_category: VITAMIN
|
||||||
category: vitamins
|
category: vitamins
|
||||||
unit: "ng/mL"
|
unit: "ng/mL"
|
||||||
@@ -640,7 +640,7 @@ biomarkers:
|
|||||||
reference:
|
reference:
|
||||||
max: 5.0
|
max: 5.0
|
||||||
|
|
||||||
- name: "VITAMIN B5/PANTOTHENIC"
|
- name: "VITAMIN B5 (PANTOTHENIC ACID)"
|
||||||
test_category: VITAMIN
|
test_category: VITAMIN
|
||||||
category: vitamins
|
category: vitamins
|
||||||
unit: "ng/mL"
|
unit: "ng/mL"
|
||||||
@@ -649,7 +649,7 @@ biomarkers:
|
|||||||
min: 11.0
|
min: 11.0
|
||||||
max: 150.0
|
max: 150.0
|
||||||
|
|
||||||
- name: "VITAMIN B6/P5P"
|
- name: "VITAMIN B6 (PYRIDOXAL-5-PHOSPHATE)"
|
||||||
test_category: VITAMIN
|
test_category: VITAMIN
|
||||||
category: vitamins
|
category: vitamins
|
||||||
unit: "ng/mL"
|
unit: "ng/mL"
|
||||||
@@ -658,7 +658,7 @@ biomarkers:
|
|||||||
min: 5.0
|
min: 5.0
|
||||||
max: 50.0
|
max: 50.0
|
||||||
|
|
||||||
- name: "VITAMIN B7/BIOTIN"
|
- name: "VITAMIN B7 (BIOTIN)"
|
||||||
test_category: VITAMIN
|
test_category: VITAMIN
|
||||||
category: vitamins
|
category: vitamins
|
||||||
unit: "ng/mL"
|
unit: "ng/mL"
|
||||||
@@ -667,7 +667,7 @@ biomarkers:
|
|||||||
min: 0.2
|
min: 0.2
|
||||||
max: 3.0
|
max: 3.0
|
||||||
|
|
||||||
- name: "VITAMIN B9/FOLIC ACID"
|
- name: "VITAMIN B9 (FOLIC ACID)"
|
||||||
test_category: VITAMIN
|
test_category: VITAMIN
|
||||||
category: vitamins
|
category: vitamins
|
||||||
unit: "ng/mL"
|
unit: "ng/mL"
|
||||||
@@ -676,7 +676,7 @@ biomarkers:
|
|||||||
min: 0.2
|
min: 0.2
|
||||||
max: 20.0
|
max: 20.0
|
||||||
|
|
||||||
- name: "VITAMIN B-12"
|
- name: "VITAMIN B12 (COBALAMIN)"
|
||||||
test_category: VITAMIN
|
test_category: VITAMIN
|
||||||
category: vitamins
|
category: vitamins
|
||||||
unit: "pg/mL"
|
unit: "pg/mL"
|
||||||
@@ -951,7 +951,7 @@ biomarkers:
|
|||||||
- { min: 4, max: 10, label: "Moderate risk of future heart attack" }
|
- { min: 4, max: 10, label: "Moderate risk of future heart attack" }
|
||||||
- { min: 10, label: "Elevated risk of future heart attack" }
|
- { min: 10, label: "Elevated risk of future heart attack" }
|
||||||
|
|
||||||
- name: "HS-CRP"
|
- name: "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)"
|
||||||
test_category: CARDIAC
|
test_category: CARDIAC
|
||||||
category: cardiac
|
category: cardiac
|
||||||
unit: "mg/L"
|
unit: "mg/L"
|
||||||
@@ -970,7 +970,7 @@ biomarkers:
|
|||||||
reference:
|
reference:
|
||||||
max: 30.0
|
max: 30.0
|
||||||
|
|
||||||
- name: "LP-PLA2"
|
- name: "LIPOPROTEIN-ASSOCIATED PHOSPHOLIPASE A2 (LP-PLA2)"
|
||||||
test_category: CARDIAC
|
test_category: CARDIAC
|
||||||
category: cardiac
|
category: cardiac
|
||||||
unit: "nmol/min/mL"
|
unit: "nmol/min/mL"
|
||||||
@@ -1062,7 +1062,7 @@ biomarkers:
|
|||||||
min: 2.6
|
min: 2.6
|
||||||
max: 6.0
|
max: 6.0
|
||||||
|
|
||||||
- name: "eGFR"
|
- name: "ESTIMATED GLOMERULAR FILTRATION RATE (eGFR)"
|
||||||
test_category: RENAL
|
test_category: RENAL
|
||||||
category: renal
|
category: renal
|
||||||
unit: "mL/min/1.73m²"
|
unit: "mL/min/1.73m²"
|
||||||
@@ -1733,7 +1733,7 @@ biomarkers:
|
|||||||
category: body
|
category: body
|
||||||
unit: "cm"
|
unit: "cm"
|
||||||
|
|
||||||
- name: "BMI"
|
- name: "BODY MASS INDEX (BMI)"
|
||||||
test_category: BODY
|
test_category: BODY
|
||||||
category: body
|
category: body
|
||||||
unit: "kg/m²"
|
unit: "kg/m²"
|
||||||
@@ -1773,7 +1773,7 @@ biomarkers:
|
|||||||
- { min: 80, max: 89, label: "High Blood Pressure Stage 1" }
|
- { min: 80, max: 89, label: "High Blood Pressure Stage 1" }
|
||||||
- { min: 90, label: "High Blood Pressure Stage 2" }
|
- { min: 90, label: "High Blood Pressure Stage 2" }
|
||||||
|
|
||||||
- name: "SPO2"
|
- name: "OXYGEN SATURATION (SpO2)"
|
||||||
test_category: VITALS
|
test_category: VITALS
|
||||||
category: vitals
|
category: vitals
|
||||||
unit: "%"
|
unit: "%"
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ pub struct Config {
|
|||||||
pub auth: AuthConfig,
|
pub auth: AuthConfig,
|
||||||
pub admin: AdminConfig,
|
pub admin: AdminConfig,
|
||||||
pub ai: AiConfig,
|
pub ai: AiConfig,
|
||||||
|
pub mistral: MistralConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
@@ -20,11 +21,12 @@ pub struct ServerConfig {
|
|||||||
pub port: u16,
|
pub port: u16,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize, Clone)]
|
||||||
pub struct PathsConfig {
|
pub struct PathsConfig {
|
||||||
pub database: String,
|
pub database: String,
|
||||||
pub logs: String,
|
pub logs: String,
|
||||||
pub uploads: String,
|
pub uploads: String,
|
||||||
|
pub max_upload_mb: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
@@ -53,6 +55,15 @@ pub struct AiConfig {
|
|||||||
pub api_key: String,
|
pub api_key: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Clone)]
|
||||||
|
pub struct MistralConfig {
|
||||||
|
pub api_key: String,
|
||||||
|
pub ocr_model: String,
|
||||||
|
pub max_pages_per_request: u32,
|
||||||
|
pub max_retries: u32,
|
||||||
|
pub timeout_secs: u64,
|
||||||
|
}
|
||||||
|
|
||||||
impl Config {
|
impl Config {
|
||||||
/// Load configuration from a YAML file.
|
/// Load configuration from a YAML file.
|
||||||
pub fn load<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
|
pub fn load<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
|
||||||
|
|||||||
@@ -5,5 +5,6 @@ pub mod biomarkers;
|
|||||||
pub mod categories;
|
pub mod categories;
|
||||||
pub mod diets;
|
pub mod diets;
|
||||||
pub mod entries;
|
pub mod entries;
|
||||||
|
pub mod ocr;
|
||||||
pub mod sources;
|
pub mod sources;
|
||||||
pub mod users;
|
pub mod users;
|
||||||
|
|||||||
180
backend/src/handlers/ocr/matching.rs
Normal file
180
backend/src/handlers/ocr/matching.rs
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
//! Biomarker matching and merging logic.
|
||||||
|
|
||||||
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use strsim::jaro_winkler;
|
||||||
|
|
||||||
|
use super::types::{Biomarker, DocumentAnnotation, OcrResult};
|
||||||
|
|
||||||
|
/// Fuzzy matching threshold (0.0 - 1.0).
|
||||||
|
/// Names with Jaro-Winkler similarity >= this value are considered a match.
|
||||||
|
const FUZZY_THRESHOLD: f64 = 0.90;
|
||||||
|
|
||||||
|
/// Find a matching biomarker name from the valid set.
|
||||||
|
/// Returns the canonical name if found (exact, alias, or fuzzy match).
|
||||||
|
///
|
||||||
|
/// Matching order:
|
||||||
|
/// 1. Exact match on full name
|
||||||
|
/// 2. Extract parenthetical alias from INPUT (e.g., `(HS-CRP)` from `HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)`)
|
||||||
|
/// 3. Extract parenthetical alias from SCHEMA (e.g., `HS-CRP` matches `HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)`)
|
||||||
|
/// 4. Fuzzy match with Jaro-Winkler (threshold 0.90)
|
||||||
|
fn find_matching_biomarker(name: &str, valid_biomarkers: &HashSet<String>) -> Option<String> {
|
||||||
|
let name_upper = name.to_uppercase();
|
||||||
|
|
||||||
|
// 1. Exact match first (fast path)
|
||||||
|
if valid_biomarkers.contains(&name_upper) {
|
||||||
|
return Some(name_upper);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Try extracting parenthetical alias from INPUT
|
||||||
|
if let Some(alias) = extract_parenthetical_alias(&name_upper) {
|
||||||
|
if valid_biomarkers.contains(&alias) {
|
||||||
|
tracing::debug!(
|
||||||
|
"Alias matched '{}' -> '{}' (extracted from parentheses in input)",
|
||||||
|
name, alias
|
||||||
|
);
|
||||||
|
return Some(alias);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Try matching input against aliases in SCHEMA
|
||||||
|
// This handles input "HS-CRP" matching schema "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)"
|
||||||
|
for valid in valid_biomarkers {
|
||||||
|
if let Some(alias) = extract_parenthetical_alias(valid) {
|
||||||
|
if alias == name_upper {
|
||||||
|
tracing::debug!(
|
||||||
|
"Reverse alias matched '{}' -> '{}' (input is alias in schema)",
|
||||||
|
name, valid
|
||||||
|
);
|
||||||
|
return Some(valid.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Fuzzy match with threshold
|
||||||
|
valid_biomarkers.iter()
|
||||||
|
.map(|valid| (valid, jaro_winkler(&name_upper, valid)))
|
||||||
|
.filter(|(_, score)| *score >= FUZZY_THRESHOLD)
|
||||||
|
.max_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
|
||||||
|
.map(|(matched_name, score)| {
|
||||||
|
tracing::debug!(
|
||||||
|
"Fuzzy matched '{}' -> '{}' (score: {:.3})",
|
||||||
|
name, matched_name, score
|
||||||
|
);
|
||||||
|
matched_name.clone()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract alias from parentheses or brackets at the end of a name.
|
||||||
|
/// Examples:
|
||||||
|
/// - "HIGH SENSITIVITY C-REACTIVE PROTEIN (HS-CRP)" -> "HS-CRP"
|
||||||
|
/// - "EST. GLOMERULAR FILTRATION RATE (eGFR)" -> "EGFR"
|
||||||
|
/// - "LIPOPROTEIN (A) [LP(A)]" -> None (nested parens too complex)
|
||||||
|
fn extract_parenthetical_alias(name: &str) -> Option<String> {
|
||||||
|
let name = name.trim();
|
||||||
|
|
||||||
|
// Look for trailing (ALIAS) pattern
|
||||||
|
if let Some(start) = name.rfind('(') {
|
||||||
|
if name.ends_with(')') {
|
||||||
|
let alias = &name[start + 1..name.len() - 1];
|
||||||
|
// Only use if it looks like an abbreviation (mostly uppercase, short)
|
||||||
|
if alias.len() >= 2 && alias.len() <= 15 && !alias.contains(' ') {
|
||||||
|
return Some(alias.to_uppercase());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge multiple OCR results into one, filtering to only known biomarkers.
|
||||||
|
/// Uses fuzzy matching to handle name variations.
|
||||||
|
pub fn merge_results(results: Vec<DocumentAnnotation>, valid_biomarkers: &HashSet<String>) -> OcrResult {
|
||||||
|
let mut merged = OcrResult {
|
||||||
|
patient_name: None,
|
||||||
|
patient_age: None,
|
||||||
|
patient_gender: None,
|
||||||
|
lab_name: None,
|
||||||
|
test_date: None,
|
||||||
|
biomarkers: Vec::new(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Track biomarkers by canonical name, prefer ones with actual values
|
||||||
|
let mut biomarker_map: HashMap<String, Biomarker> = HashMap::new();
|
||||||
|
let mut skipped_count = 0;
|
||||||
|
let mut fuzzy_matched_count = 0;
|
||||||
|
|
||||||
|
for result in results {
|
||||||
|
// Take first non-null metadata
|
||||||
|
if merged.patient_name.is_none() && result.patient_name.is_some() {
|
||||||
|
merged.patient_name = result.patient_name;
|
||||||
|
}
|
||||||
|
if merged.patient_age.is_none() && result.patient_age.is_some() {
|
||||||
|
merged.patient_age = result.patient_age;
|
||||||
|
}
|
||||||
|
if merged.patient_gender.is_none() && result.patient_gender.is_some() {
|
||||||
|
merged.patient_gender = result.patient_gender;
|
||||||
|
}
|
||||||
|
if merged.lab_name.is_none() && result.lab_name.is_some() {
|
||||||
|
merged.lab_name = result.lab_name;
|
||||||
|
}
|
||||||
|
if merged.test_date.is_none() && result.test_date.is_some() {
|
||||||
|
merged.test_date = result.test_date;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Merge biomarkers with fuzzy matching
|
||||||
|
if let Some(biomarkers) = result.biomarkers {
|
||||||
|
for mut bm in biomarkers {
|
||||||
|
let original_name = bm.name.clone();
|
||||||
|
|
||||||
|
// Try to find a matching canonical name
|
||||||
|
let canonical_name = match find_matching_biomarker(&bm.name, valid_biomarkers) {
|
||||||
|
Some(matched) => {
|
||||||
|
if matched != bm.name.to_uppercase() {
|
||||||
|
fuzzy_matched_count += 1;
|
||||||
|
}
|
||||||
|
// Update the biomarker name to canonical form
|
||||||
|
bm.name = matched.clone();
|
||||||
|
matched
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
tracing::debug!("Skipping unknown biomarker: {}", original_name);
|
||||||
|
skipped_count += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let has_real_value = bm.value.is_some() ||
|
||||||
|
bm.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false);
|
||||||
|
|
||||||
|
if let Some(existing) = biomarker_map.get(&canonical_name) {
|
||||||
|
let existing_has_real_value = existing.value.is_some() ||
|
||||||
|
existing.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false);
|
||||||
|
|
||||||
|
// Replace only if current has real value and existing doesn't
|
||||||
|
if has_real_value && !existing_has_real_value {
|
||||||
|
biomarker_map.insert(canonical_name, bm);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
biomarker_map.insert(canonical_name, bm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if skipped_count > 0 {
|
||||||
|
tracing::info!("Skipped {} unknown biomarkers not in schema", skipped_count);
|
||||||
|
}
|
||||||
|
if fuzzy_matched_count > 0 {
|
||||||
|
tracing::info!("Fuzzy matched {} biomarkers to canonical names", fuzzy_matched_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect biomarkers from map, filtering out "Not Provided" only entries
|
||||||
|
merged.biomarkers = biomarker_map.into_values()
|
||||||
|
.filter(|bm| {
|
||||||
|
bm.value.is_some() ||
|
||||||
|
bm.value_string.as_ref().map(|s| !s.eq_ignore_ascii_case("not provided")).unwrap_or(false)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
merged
|
||||||
|
}
|
||||||
211
backend/src/handlers/ocr/mistral.rs
Normal file
211
backend/src/handlers/ocr/mistral.rs
Normal file
@@ -0,0 +1,211 @@
|
|||||||
|
//! Mistral API integration for OCR.
|
||||||
|
|
||||||
|
use reqwest::multipart::{Form, Part};
|
||||||
|
use serde_json::{json, Value};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::fs;
|
||||||
|
|
||||||
|
use crate::config::MistralConfig;
|
||||||
|
use super::types::{Biomarker, DocumentAnnotation, MistralFileResponse, MistralOcrResponse};
|
||||||
|
use super::schema::strip_descriptions;
|
||||||
|
|
||||||
|
/// Upload a file to Mistral and return the file ID.
|
||||||
|
pub async fn upload_to_mistral(config: &MistralConfig, file_path: &PathBuf) -> Result<String, String> {
|
||||||
|
let client = reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(config.timeout_secs))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| format!("Failed to create HTTP client: {}", e))?;
|
||||||
|
|
||||||
|
let file_bytes = fs::read(file_path)
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Failed to read file: {}", e))?;
|
||||||
|
|
||||||
|
let file_name = file_path
|
||||||
|
.file_name()
|
||||||
|
.and_then(|n| n.to_str())
|
||||||
|
.unwrap_or("document.pdf")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let part = Part::bytes(file_bytes)
|
||||||
|
.file_name(file_name)
|
||||||
|
.mime_str("application/pdf")
|
||||||
|
.map_err(|e| format!("MIME error: {}", e))?;
|
||||||
|
|
||||||
|
let form = Form::new()
|
||||||
|
.text("purpose", "ocr")
|
||||||
|
.part("file", part);
|
||||||
|
|
||||||
|
let response = client
|
||||||
|
.post("https://api.mistral.ai/v1/files")
|
||||||
|
.header("Authorization", format!("Bearer {}", config.api_key))
|
||||||
|
.multipart(form)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("HTTP request failed: {}", e))?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
let error_text = response.text().await.unwrap_or_default();
|
||||||
|
return Err(format!("Mistral upload failed: {}", error_text));
|
||||||
|
}
|
||||||
|
|
||||||
|
let response_text = response.text().await
|
||||||
|
.map_err(|e| format!("Failed to read response: {}", e))?;
|
||||||
|
|
||||||
|
tracing::info!("Mistral file upload response: {}", response_text);
|
||||||
|
|
||||||
|
let result: MistralFileResponse = serde_json::from_str(&response_text)
|
||||||
|
.map_err(|e| format!("Failed to parse response: {} - raw: {}", e, response_text))?;
|
||||||
|
|
||||||
|
tracing::info!("Parsed file upload: id={}, num_pages={:?}", result.id, result.num_pages);
|
||||||
|
|
||||||
|
Ok(result.id)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Process OCR for specific pages of an uploaded document.
|
||||||
|
pub async fn ocr_pages(
|
||||||
|
config: &MistralConfig,
|
||||||
|
file_id: &str,
|
||||||
|
pages: &[usize],
|
||||||
|
) -> Result<DocumentAnnotation, String> {
|
||||||
|
let client = reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(config.timeout_secs))
|
||||||
|
.build()
|
||||||
|
.map_err(|e| format!("Failed to create HTTP client: {}", e))?;
|
||||||
|
|
||||||
|
// Load the complete schema from file
|
||||||
|
let schema_content = std::fs::read_to_string("ocr_schema.json")
|
||||||
|
.map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?;
|
||||||
|
let mut schema: Value = serde_json::from_str(&schema_content)
|
||||||
|
.map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?;
|
||||||
|
|
||||||
|
// Clean the schema - remove meta-fields that Mistral echoes back
|
||||||
|
if let Some(obj) = schema.as_object_mut() {
|
||||||
|
obj.remove("$schema");
|
||||||
|
obj.remove("name");
|
||||||
|
obj.remove("description");
|
||||||
|
}
|
||||||
|
strip_descriptions(&mut schema);
|
||||||
|
|
||||||
|
let body = json!({
|
||||||
|
"model": config.ocr_model,
|
||||||
|
"document": {
|
||||||
|
"type": "file",
|
||||||
|
"file_id": file_id
|
||||||
|
},
|
||||||
|
"pages": pages,
|
||||||
|
"document_annotation_format": {
|
||||||
|
"type": "json_schema",
|
||||||
|
"json_schema": {
|
||||||
|
"name": "LabReport",
|
||||||
|
"schema": schema
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let response = client
|
||||||
|
.post("https://api.mistral.ai/v1/ocr")
|
||||||
|
.header("Authorization", format!("Bearer {}", config.api_key))
|
||||||
|
.header("Content-Type", "application/json")
|
||||||
|
.json(&body)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("OCR request failed: {}", e))?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
let error_text = response.text().await.unwrap_or_default();
|
||||||
|
return Err(format!("OCR failed: {}", error_text));
|
||||||
|
}
|
||||||
|
|
||||||
|
let result: MistralOcrResponse = response
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Failed to parse OCR response: {}", e))?;
|
||||||
|
|
||||||
|
let annotation_str = result
|
||||||
|
.document_annotation
|
||||||
|
.ok_or_else(|| "No document annotation in response".to_string())?;
|
||||||
|
|
||||||
|
tracing::debug!("Raw annotation from Mistral: {}", &annotation_str);
|
||||||
|
|
||||||
|
// Mistral returns data wrapped in "properties" - extract it
|
||||||
|
let raw_json: Value = serde_json::from_str(&annotation_str)
|
||||||
|
.map_err(|e| format!("Failed to parse raw JSON: {}", e))?;
|
||||||
|
|
||||||
|
let data_json = if let Some(props) = raw_json.get("properties") {
|
||||||
|
props.clone()
|
||||||
|
} else {
|
||||||
|
raw_json
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check if this is a schema-only response (no actual data)
|
||||||
|
if let Some(biomarkers) = data_json.get("biomarkers") {
|
||||||
|
if biomarkers.get("type").is_some() && biomarkers.get("items").is_some() {
|
||||||
|
tracing::warn!("Skipping schema-only response (no data for these pages)");
|
||||||
|
return Ok(DocumentAnnotation {
|
||||||
|
patient_name: None,
|
||||||
|
patient_age: None,
|
||||||
|
patient_gender: None,
|
||||||
|
lab_name: None,
|
||||||
|
test_date: None,
|
||||||
|
biomarkers: Some(vec![]),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let annotation = parse_annotation(&data_json)?;
|
||||||
|
|
||||||
|
tracing::info!("Parsed annotation: patient={:?}, biomarkers={}",
|
||||||
|
annotation.patient_name,
|
||||||
|
annotation.biomarkers.as_ref().map(|b| b.len()).unwrap_or(0));
|
||||||
|
|
||||||
|
Ok(annotation)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse annotation handling various Mistral response formats.
|
||||||
|
fn parse_annotation(data: &Value) -> Result<DocumentAnnotation, String> {
|
||||||
|
let patient_name = data.get("patient_name").and_then(|v| v.as_str()).map(|s| s.to_string());
|
||||||
|
let patient_age = data.get("patient_age").and_then(|v| v.as_i64()).map(|n| n as i32);
|
||||||
|
let patient_gender = data.get("patient_gender").and_then(|v| v.as_str()).map(|s| s.to_string());
|
||||||
|
let lab_name = data.get("lab_name").and_then(|v| v.as_str()).map(|s| s.to_string());
|
||||||
|
let test_date = data.get("test_date").and_then(|v| v.as_str()).map(|s| s.to_string());
|
||||||
|
|
||||||
|
// Parse biomarkers - handle nested "properties" format
|
||||||
|
let biomarkers = if let Some(bm_array) = data.get("biomarkers").and_then(|v| v.as_array()) {
|
||||||
|
let mut parsed: Vec<Biomarker> = vec![];
|
||||||
|
for item in bm_array {
|
||||||
|
// Try direct format first
|
||||||
|
if let Some(name) = item.get("name").and_then(|v| v.as_str()) {
|
||||||
|
parsed.push(Biomarker {
|
||||||
|
name: name.to_string(),
|
||||||
|
value: item.get("value").and_then(|v| v.as_f64()),
|
||||||
|
value_string: item.get("value_string").and_then(|v| v.as_str()).map(|s| s.to_string()),
|
||||||
|
unit: item.get("unit").and_then(|v| v.as_str()).map(|s| s.to_string()),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// Try nested "properties" format
|
||||||
|
else if let Some(props) = item.get("properties") {
|
||||||
|
if let Some(name) = props.get("name").and_then(|v| v.as_str()) {
|
||||||
|
parsed.push(Biomarker {
|
||||||
|
name: name.to_string(),
|
||||||
|
value: props.get("value").and_then(|v| v.as_f64()),
|
||||||
|
value_string: props.get("value_string").and_then(|v| v.as_str()).map(|s| s.to_string()),
|
||||||
|
unit: props.get("unit").and_then(|v| v.as_str()).map(|s| s.to_string()),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(parsed)
|
||||||
|
} else {
|
||||||
|
Some(vec![])
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(DocumentAnnotation {
|
||||||
|
patient_name,
|
||||||
|
patient_age,
|
||||||
|
patient_gender,
|
||||||
|
lab_name,
|
||||||
|
test_date,
|
||||||
|
biomarkers,
|
||||||
|
})
|
||||||
|
}
|
||||||
200
backend/src/handlers/ocr/mod.rs
Normal file
200
backend/src/handlers/ocr/mod.rs
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
//! OCR API handlers - Mistral OCR integration for document parsing.
|
||||||
|
|
||||||
|
mod matching;
|
||||||
|
mod mistral;
|
||||||
|
mod schema;
|
||||||
|
mod types;
|
||||||
|
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use axum::{
|
||||||
|
extract::{Path, State},
|
||||||
|
http::StatusCode,
|
||||||
|
Json,
|
||||||
|
};
|
||||||
|
use sea_orm::{ActiveModelTrait, EntityTrait, Set};
|
||||||
|
|
||||||
|
use crate::models::bio::source;
|
||||||
|
|
||||||
|
// Re-export public types
|
||||||
|
pub use types::{ErrorResponse, OcrState, ParseResponse};
|
||||||
|
|
||||||
|
/// Get page count from a local file.
|
||||||
|
/// For PDFs, uses lopdf to read the actual page count.
|
||||||
|
/// For other file types (images, etc.), returns 1.
|
||||||
|
fn get_page_count(file_path: &PathBuf) -> usize {
|
||||||
|
let extension = file_path.extension()
|
||||||
|
.and_then(|e| e.to_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_lowercase();
|
||||||
|
|
||||||
|
if extension == "pdf" {
|
||||||
|
match lopdf::Document::load(file_path) {
|
||||||
|
Ok(doc) => {
|
||||||
|
let count = doc.get_pages().len();
|
||||||
|
tracing::info!("PDF page count (local): {}", count);
|
||||||
|
count
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!("Failed to read PDF page count: {}, defaulting to 1", e);
|
||||||
|
1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
tracing::info!("Non-PDF file, treating as 1 page");
|
||||||
|
1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// POST /api/sources/:id/parse - Parse a source document using Mistral OCR.
|
||||||
|
pub async fn parse_source(
|
||||||
|
State(state): State<OcrState>,
|
||||||
|
Path(id): Path<i32>,
|
||||||
|
) -> Result<Json<ParseResponse>, (StatusCode, Json<ErrorResponse>)> {
|
||||||
|
// 1. Get source from database
|
||||||
|
let source_entity = source::Entity::find_by_id(id)
|
||||||
|
.one(&state.db)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Json(ErrorResponse {
|
||||||
|
error: format!("Database error: {}", e),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
})?
|
||||||
|
.ok_or_else(|| {
|
||||||
|
(
|
||||||
|
StatusCode::NOT_FOUND,
|
||||||
|
Json(ErrorResponse {
|
||||||
|
error: "Source not found".to_string(),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let file_path = PathBuf::from(&source_entity.file_path);
|
||||||
|
|
||||||
|
// 2. Upload file to Mistral
|
||||||
|
let file_id = mistral::upload_to_mistral(&state.mistral, &file_path).await.map_err(|e| {
|
||||||
|
(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Json(ErrorResponse {
|
||||||
|
error: format!("Mistral upload failed: {}", e),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// 3. Get page count locally from PDF
|
||||||
|
let max_pages = get_page_count(&file_path);
|
||||||
|
let chunk_size = state.mistral.max_pages_per_request as usize;
|
||||||
|
let max_retries = state.mistral.max_retries;
|
||||||
|
let mut all_results: Vec<types::DocumentAnnotation> = Vec::new();
|
||||||
|
let mut failed_chunk: Option<String> = None;
|
||||||
|
|
||||||
|
for start_page in (0..max_pages).step_by(chunk_size) {
|
||||||
|
let pages: Vec<usize> = (start_page..std::cmp::min(start_page + chunk_size, max_pages)).collect();
|
||||||
|
|
||||||
|
tracing::info!("Processing OCR for pages {:?}", pages);
|
||||||
|
|
||||||
|
// Retry loop for this chunk
|
||||||
|
let mut attempts = 0;
|
||||||
|
let mut chunk_result = None;
|
||||||
|
|
||||||
|
while attempts <= max_retries {
|
||||||
|
match mistral::ocr_pages(&state.mistral, &file_id, &pages).await {
|
||||||
|
Ok(annotation) => {
|
||||||
|
chunk_result = Some(annotation);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
if e.contains("out of range") || e.contains("no pages") || e.contains("Invalid page") {
|
||||||
|
tracing::info!("Reached end of document at pages {:?}", pages);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
attempts += 1;
|
||||||
|
if attempts <= max_retries {
|
||||||
|
tracing::warn!("OCR chunk error (pages {:?}), attempt {}/{}: {}", pages, attempts, max_retries + 1, e);
|
||||||
|
} else {
|
||||||
|
tracing::error!("OCR chunk failed after {} attempts (pages {:?}): {}", max_retries + 1, pages, e);
|
||||||
|
failed_chunk = Some(format!("Pages {:?}: {}", pages, e));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(annotation) = chunk_result {
|
||||||
|
all_results.push(annotation);
|
||||||
|
} else if failed_chunk.is_some() {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fail if any chunk failed
|
||||||
|
if let Some(error_msg) = failed_chunk {
|
||||||
|
return Err((
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Json(ErrorResponse {
|
||||||
|
error: format!("OCR parsing failed: {}", error_msg),
|
||||||
|
}),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
if all_results.is_empty() {
|
||||||
|
return Err((
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Json(ErrorResponse {
|
||||||
|
error: "No OCR results obtained".to_string(),
|
||||||
|
}),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Get valid biomarker names from schema
|
||||||
|
let valid_biomarkers = schema::extract_valid_biomarker_names().map_err(|e| {
|
||||||
|
(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Json(ErrorResponse {
|
||||||
|
error: format!("Failed to read schema: {}", e),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
tracing::info!("Loaded {} valid biomarker names from schema", valid_biomarkers.len());
|
||||||
|
|
||||||
|
// 5. Merge results with fuzzy matching
|
||||||
|
let merged = matching::merge_results(all_results, &valid_biomarkers);
|
||||||
|
|
||||||
|
// 6. Save to database
|
||||||
|
let ocr_json = serde_json::to_string(&merged).map_err(|e| {
|
||||||
|
(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Json(ErrorResponse {
|
||||||
|
error: format!("JSON serialization failed: {}", e),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut active_model: source::ActiveModel = source_entity.into();
|
||||||
|
active_model.ocr_data = Set(Some(ocr_json));
|
||||||
|
|
||||||
|
active_model.update(&state.db).await.map_err(|e| {
|
||||||
|
(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Json(ErrorResponse {
|
||||||
|
error: format!("Database update failed: {}", e),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(Json(ParseResponse {
|
||||||
|
success: true,
|
||||||
|
biomarkers_count: merged.biomarkers.len(),
|
||||||
|
message: format!(
|
||||||
|
"Successfully parsed {} biomarkers for {}",
|
||||||
|
merged.biomarkers.len(),
|
||||||
|
merged.patient_name.unwrap_or_else(|| "Unknown".to_string())
|
||||||
|
),
|
||||||
|
}))
|
||||||
|
}
|
||||||
49
backend/src/handlers/ocr/schema.rs
Normal file
49
backend/src/handlers/ocr/schema.rs
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
//! Schema handling utilities.
|
||||||
|
|
||||||
|
use serde_json::Value;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
/// Extract valid biomarker names from the ocr_schema.json enum.
|
||||||
|
pub fn extract_valid_biomarker_names() -> Result<HashSet<String>, String> {
|
||||||
|
let schema_content = std::fs::read_to_string("ocr_schema.json")
|
||||||
|
.map_err(|e| format!("Failed to read ocr_schema.json: {}", e))?;
|
||||||
|
let schema: Value = serde_json::from_str(&schema_content)
|
||||||
|
.map_err(|e| format!("Failed to parse ocr_schema.json: {}", e))?;
|
||||||
|
|
||||||
|
// Navigate to: properties.biomarkers.items.properties.name.enum
|
||||||
|
let names = schema
|
||||||
|
.get("properties")
|
||||||
|
.and_then(|p| p.get("biomarkers"))
|
||||||
|
.and_then(|b| b.get("items"))
|
||||||
|
.and_then(|i| i.get("properties"))
|
||||||
|
.and_then(|p| p.get("name"))
|
||||||
|
.and_then(|n| n.get("enum"))
|
||||||
|
.and_then(|e| e.as_array())
|
||||||
|
.ok_or_else(|| "Could not find biomarker name enum in schema".to_string())?;
|
||||||
|
|
||||||
|
let valid_names: HashSet<String> = names
|
||||||
|
.iter()
|
||||||
|
.filter_map(|v| v.as_str())
|
||||||
|
.map(|s| s.to_uppercase())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Ok(valid_names)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Recursively remove "description" fields from a JSON value.
|
||||||
|
pub fn strip_descriptions(value: &mut Value) {
|
||||||
|
match value {
|
||||||
|
Value::Object(map) => {
|
||||||
|
map.remove("description");
|
||||||
|
for (_, v) in map.iter_mut() {
|
||||||
|
strip_descriptions(v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Value::Array(arr) => {
|
||||||
|
for v in arr.iter_mut() {
|
||||||
|
strip_descriptions(v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
77
backend/src/handlers/ocr/types.rs
Normal file
77
backend/src/handlers/ocr/types.rs
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
//! Type definitions for OCR module.

use sea_orm::DatabaseConnection;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;

use crate::config::MistralConfig;

/// State for OCR handlers.
#[derive(Clone)]
pub struct OcrState {
    /// Database handle used to load and update `source` rows.
    pub db: DatabaseConnection,
    /// Root directory for uploaded files.
    // NOTE(review): not read by parse_source in this module view — the file
    // path comes from the source row itself; confirm whether this is needed.
    pub uploads_path: PathBuf,
    /// Mistral API configuration; `max_pages_per_request` and `max_retries`
    /// drive the chunked OCR loop in `parse_source`.
    pub mistral: MistralConfig,
}

/// Response for parse endpoint.
#[derive(Serialize)]
pub struct ParseResponse {
    /// Always `true` on the success path (errors use `ErrorResponse`).
    pub success: bool,
    /// Number of merged biomarkers extracted from the document.
    pub biomarkers_count: usize,
    /// Human-readable summary including the patient name.
    pub message: String,
}

/// Error response.
#[derive(Serialize)]
pub struct ErrorResponse {
    /// Human-readable error description returned to the client.
    pub error: String,
}

/// Mistral file upload response.
#[derive(Deserialize)]
pub struct MistralFileResponse {
    /// Mistral-assigned file id, passed to subsequent OCR requests.
    pub id: String,
    /// File size in bytes as reported by Mistral (currently unused).
    #[allow(dead_code)]
    pub bytes: i64,
    /// Page count reported by Mistral, when available; local lopdf
    /// counting is used as the authoritative source instead.
    pub num_pages: Option<usize>,
}

/// Mistral OCR response.
#[derive(Deserialize)]
pub struct MistralOcrResponse {
    /// JSON-encoded annotation string; parsed downstream into
    /// `DocumentAnnotation`.
    pub document_annotation: Option<String>,
    /// Raw per-page payloads (currently unused).
    #[allow(dead_code)]
    pub pages: Option<Vec<serde_json::Value>>,
}

/// Extracted biomarker from OCR.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Biomarker {
    /// Test/biomarker name as extracted (matched against the schema enum).
    pub name: String,
    /// Numeric result, when the value parses as a number.
    pub value: Option<f64>,
    /// Textual result, for non-numeric values (e.g. "Negative").
    pub value_string: Option<String>,
    /// Measurement unit, when present.
    pub unit: Option<String>,
}

/// Merged OCR result.
#[derive(Debug, Serialize, Deserialize)]
pub struct OcrResult {
    pub patient_name: Option<String>,
    pub patient_age: Option<i32>,
    pub patient_gender: Option<String>,
    pub lab_name: Option<String>,
    pub test_date: Option<String>,
    /// Deduplicated biomarkers across all page chunks.
    pub biomarkers: Vec<Biomarker>,
}

/// Document annotation from Mistral.
// NOTE(review): `report_id` from ocr_schema.json is not captured here —
// confirm whether it should be.
#[derive(Debug, Deserialize)]
pub struct DocumentAnnotation {
    pub patient_name: Option<String>,
    pub patient_age: Option<i32>,
    pub patient_gender: Option<String>,
    pub lab_name: Option<String>,
    /// Sample collection date as a string (YYYY-MM-DD when the lab
    /// report provides it).
    pub test_date: Option<String>,
    /// Biomarkers for one page chunk; `Some(vec![])` for schema-only
    /// responses with no data.
    pub biomarkers: Option<Vec<Biomarker>>,
}
|
||||||
@@ -156,13 +156,26 @@ fn create_router(db: DatabaseConnection, config: &config::Config) -> Router {
|
|||||||
.route("/api/sources/{id}", get(handlers::sources::get_source)
|
.route("/api/sources/{id}", get(handlers::sources::get_source)
|
||||||
.delete(handlers::sources::delete_source))
|
.delete(handlers::sources::delete_source))
|
||||||
.route("/api/sources/{id}/ocr", put(handlers::sources::update_ocr))
|
.route("/api/sources/{id}/ocr", put(handlers::sources::update_ocr))
|
||||||
|
.layer(axum::extract::DefaultBodyLimit::max(config.paths.max_upload_mb as usize * 1024 * 1024))
|
||||||
.route_layer(middleware::from_fn(require_auth))
|
.route_layer(middleware::from_fn(require_auth))
|
||||||
.with_state(sources_state);
|
.with_state(sources_state);
|
||||||
|
|
||||||
|
// OCR routes (need Mistral config)
|
||||||
|
let ocr_state = handlers::ocr::OcrState {
|
||||||
|
db: db.clone(),
|
||||||
|
uploads_path: PathBuf::from(&config.paths.uploads),
|
||||||
|
mistral: config.mistral.clone(),
|
||||||
|
};
|
||||||
|
let ocr_routes = Router::new()
|
||||||
|
.route("/api/sources/{id}/parse", post(handlers::ocr::parse_source))
|
||||||
|
.route_layer(middleware::from_fn(require_auth))
|
||||||
|
.with_state(ocr_state);
|
||||||
|
|
||||||
Router::new()
|
Router::new()
|
||||||
.merge(public_routes)
|
.merge(public_routes)
|
||||||
.merge(protected_routes)
|
.merge(protected_routes)
|
||||||
.merge(sources_routes)
|
.merge(sources_routes)
|
||||||
|
.merge(ocr_routes)
|
||||||
.layer(auth_layer)
|
.layer(auth_layer)
|
||||||
.with_state(db)
|
.with_state(db)
|
||||||
}
|
}
|
||||||
@@ -185,10 +198,18 @@ async fn require_auth(
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn init_logging(config: &config::Config) {
|
fn init_logging(config: &config::Config) {
|
||||||
let log_level = config.logging.level.parse().unwrap_or(tracing::Level::INFO);
|
// Build filter: use configured level for our code, but restrict sqlx/sea_orm
|
||||||
|
let filter_str = format!(
|
||||||
|
"{},sqlx=warn,sea_orm=warn",
|
||||||
|
config.logging.level
|
||||||
|
);
|
||||||
|
|
||||||
|
let filter = tracing_subscriber::filter::EnvFilter::try_new(&filter_str)
|
||||||
|
.unwrap_or_else(|_| tracing_subscriber::filter::EnvFilter::new("info,sqlx=warn,sea_orm=warn"));
|
||||||
|
|
||||||
tracing_subscriber::registry()
|
tracing_subscriber::registry()
|
||||||
.with(tracing_subscriber::fmt::layer())
|
.with(tracing_subscriber::fmt::layer())
|
||||||
.with(tracing_subscriber::filter::LevelFilter::from_level(log_level))
|
.with(filter)
|
||||||
.init();
|
.init();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ export function SourcesPage() {
|
|||||||
const [error, setError] = useState<string | null>(null)
|
const [error, setError] = useState<string | null>(null)
|
||||||
const [dragOver, setDragOver] = useState(false)
|
const [dragOver, setDragOver] = useState(false)
|
||||||
const [deleteConfirmId, setDeleteConfirmId] = useState<number | null>(null)
|
const [deleteConfirmId, setDeleteConfirmId] = useState<number | null>(null)
|
||||||
|
const [parsingId, setParsingId] = useState<number | null>(null)
|
||||||
const fileInputRef = useRef<HTMLInputElement>(null)
|
const fileInputRef = useRef<HTMLInputElement>(null)
|
||||||
|
|
||||||
// Fetch sources on mount
|
// Fetch sources on mount
|
||||||
@@ -98,6 +99,31 @@ export function SourcesPage() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const handleParse = async (id: number) => {
|
||||||
|
setParsingId(id)
|
||||||
|
setError(null)
|
||||||
|
try {
|
||||||
|
const res = await fetch(`/api/sources/${id}/parse`, {
|
||||||
|
method: 'POST',
|
||||||
|
credentials: 'include',
|
||||||
|
})
|
||||||
|
if (res.ok) {
|
||||||
|
const data = await res.json()
|
||||||
|
// Refresh sources to show updated status
|
||||||
|
fetchSources()
|
||||||
|
console.log('Parsed:', data)
|
||||||
|
} else {
|
||||||
|
const err = await res.json()
|
||||||
|
setError(err.error || 'Parse failed')
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Failed to parse:', e)
|
||||||
|
setError('Failed to parse document')
|
||||||
|
} finally {
|
||||||
|
setParsingId(null)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const formatFileSize = (bytes: number) => {
|
const formatFileSize = (bytes: number) => {
|
||||||
if (bytes < 1024) return `${bytes} B`
|
if (bytes < 1024) return `${bytes} B`
|
||||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`
|
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`
|
||||||
@@ -196,7 +222,13 @@ export function SourcesPage() {
|
|||||||
<img src="/icons/general/icons8-checkmark-50.png" alt="Parsed" className="icon-sm" /> Parsed
|
<img src="/icons/general/icons8-checkmark-50.png" alt="Parsed" className="icon-sm" /> Parsed
|
||||||
</span>
|
</span>
|
||||||
) : (
|
) : (
|
||||||
<span className="text-secondary text-xs">Pending</span>
|
<button
|
||||||
|
className="btn btn-primary btn-sm"
|
||||||
|
onClick={() => handleParse(source.id)}
|
||||||
|
disabled={parsingId === source.id}
|
||||||
|
>
|
||||||
|
{parsingId === source.id ? 'Parsing...' : 'Parse'}
|
||||||
|
</button>
|
||||||
)}
|
)}
|
||||||
<button
|
<button
|
||||||
className="btn btn-danger btn-sm"
|
className="btn btn-danger btn-sm"
|
||||||
|
|||||||
Reference in New Issue
Block a user