From 18d0f320ab895bf6c8583f13520aff7cd4a62f76 Mon Sep 17 00:00:00 2001 From: prescientmoon Date: Tue, 17 Sep 2024 03:23:45 +0200 Subject: [PATCH] Improve OCR on downscaled images --- src/recognition/hyperglass.rs | 39 +++++++++++++++++++++++------------ src/recognition/recognize.rs | 11 ++++++---- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/recognition/hyperglass.rs b/src/recognition/hyperglass.rs index 636ef99..f254ebf 100644 --- a/src/recognition/hyperglass.rs +++ b/src/recognition/hyperglass.rs @@ -72,7 +72,7 @@ impl ComponentVec { if let Some(p) = components.components.get_pixel_checked(x, y) && p.0[0] == component { - count += 1; + count += 255 - components.image[(x, y)].0[0] as u32; } } } @@ -141,6 +141,7 @@ struct ComponentBounds { } struct ComponentsWithBounds { + image: ImageBuffer, Vec>, components: ImageBuffer, Vec>, // NOTE: the index is (the id of the component) - 1 @@ -153,16 +154,17 @@ struct ComponentsWithBounds { } impl ComponentsWithBounds { - fn from_image(image: &DynamicImage, binarisation_threshold: u8) -> Result { - let image = threshold( - &image.to_luma8(), - binarisation_threshold, - ThresholdType::Binary, - ); - debug_image_buffer_log(&image); + fn from_image( + image: &DynamicImage, + binarisation_threshold: u8, + max_sizes: (f32, f32), + ) -> Result { + let luma_image = image.to_luma8(); + let binarized_image = threshold(&luma_image, binarisation_threshold, ThresholdType::Binary); + debug_image_buffer_log(&binarized_image); let background = Luma([u8::MAX]); - let components = connected_components(&image, Connectivity::Eight, background); + let components = connected_components(&binarized_image, Connectivity::Eight, background); let mut bounds: Vec> = Vec::new(); for x in 0..components.width() { @@ -198,7 +200,13 @@ impl ComponentsWithBounds { // {{{ Remove components that are too large for bound in &mut bounds { - if bound.map_or(false, |b| (b.x_max - b.x_min) >= 9 * image.width() / 10) { + if bound.map_or(false, |b| { + (b.x_max - b.x_min) as f32 >= max_sizes.0 * image.width() as f32 + }) { + *bound = None; + } else if bound.map_or(false, |b| { + (b.y_max - b.y_min) as f32 >= max_sizes.1 * image.height() as f32 + }) { *bound = None; } } @@ -210,6 +218,7 @@ impl ComponentsWithBounds { bounds_by_position.sort_by_key(|i| bounds[*i].unwrap().x_min); Ok(Self { + image: luma_image, components, bounds, bounds_by_position, @@ -254,7 +263,7 @@ impl CharMeasurements { debug_image_log(&image); - let components = ComponentsWithBounds::from_image(&image, 100)?; + let components = ComponentsWithBounds::from_image(&image, 100, (1.0, 1.0))?; // {{{ Compute max width/height let max_width = components @@ -298,9 +307,13 @@ impl CharMeasurements { image: &DynamicImage, whitelist: &str, binarisation_threshold: Option, + max_sizes: Option<(f32, f32)>, ) -> Result { - let components = - ComponentsWithBounds::from_image(image, binarisation_threshold.unwrap_or(100))?; + let components = ComponentsWithBounds::from_image( + image, + binarisation_threshold.unwrap_or(100), + max_sizes.unwrap_or((0.9, 1.0)), + )?; let mut result = String::with_capacity(components.bounds.len()); let max_height = components diff --git a/src/recognition/recognize.rs b/src/recognition/recognize.rs index 530fe3c..9bfa63f 100644 --- a/src/recognition/recognize.rs +++ b/src/recognition/recognize.rs @@ -150,7 +150,7 @@ impl ImageAnalyzer { let result = Score( measurements - .recognise(&image, "0123456789'", None)? + .recognise(&image, "0123456789'", None, None)? .chars() .filter(|c| *c != '\'') .collect::() @@ -218,6 +218,7 @@ impl ImageAnalyzer { let text = ctx.kazesawa_bold_measurements.recognise( &image, "PASTPRESENTFUTUREETERNALBEYOND", + Some(200), // We can afford to be generous with binarization here None, )?; @@ -240,7 +241,7 @@ impl ImageAnalyzer { let image = self.interp_crop(ctx, image, PlayKind)?; let text = ctx .kazesawa_measurements - .recognise(&image, "ResultSelectaSong ", None)?; + .recognise(&image, "ResultSelectaSong ", None, None)?; let result = if edit_distance(&text, "Result") < edit_distance(&text, "SelectaSong") { ScoreKind::ScoreScreen @@ -342,7 +343,8 @@ impl ImageAnalyzer { let image = self.interp_crop(ctx, image, ScoreScreen(KINDS[i]))?; out[i] = ctx .kazesawa_bold_measurements - .recognise(&image, "0123456789", Some(30))? + // We need to be very strict with binarization here + .recognise(&image, "0123456789", Some(30), Some((0.33, 0.85)))? .parse() .unwrap_or(100000); // This will get discarded as making no sense } @@ -361,7 +363,8 @@ impl ImageAnalyzer { let image = self.interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::MaxRecall))?; let max_recall = ctx .exo_measurements - .recognise(&image, "0123456789", None)? + // We can afford to be generous with binarization here + .recognise(&image, "0123456789", Some(200), None)? .parse()?; Ok(max_recall)