1
Fork 0

No longer use tesseract, I guess?

Signed-off-by: prescientmoon <git@moonythm.dev>
This commit is contained in:
prescientmoon 2024-08-11 19:49:46 +02:00
parent 5c95cdb018
commit 48c1f74f93
Signed by: prescientmoon
SSH key fingerprint: SHA256:UUF9JT2s8Xfyv76b8ZuVL7XrmimH4o49p4b+iexbVH4
8 changed files with 177 additions and 183 deletions

1
Cargo.lock generated
View file

@ -2605,6 +2605,7 @@ dependencies = [
"num", "num",
"plotters", "plotters",
"poise", "poise",
"rand",
"sqlx", "sqlx",
"tokio", "tokio",
] ]

View file

@ -14,6 +14,7 @@ sqlx = { version = "0.8.0", features = ["sqlite", "runtime-tokio", "chrono"] }
hypertesseract = { features=["image"], git="https://github.com/BlueGhostGH/hypertesseract.git", rev="4e05063" } hypertesseract = { features=["image"], git="https://github.com/BlueGhostGH/hypertesseract.git", rev="4e05063" }
tokio = {version="1.38.0", features=["rt-multi-thread"]} tokio = {version="1.38.0", features=["rt-multi-thread"]}
imageproc = "0.25.0" imageproc = "0.25.0"
rand = "0.8.5"
[profile.dev.package."*"] [profile.dev.package."*"]
opt-level = 3 opt-level = 3

View file

@ -21,7 +21,7 @@ impl Difficulty {
pub const DIFFICULTY_SHORTHANDS: [&'static str; 5] = ["PST", "PRS", "FTR", "ETR", "BYD"]; pub const DIFFICULTY_SHORTHANDS: [&'static str; 5] = ["PST", "PRS", "FTR", "ETR", "BYD"];
pub const DIFFICULTY_STRINGS: [&'static str; 5] = pub const DIFFICULTY_STRINGS: [&'static str; 5] =
["past", "present", "future", "eternal", "beyond"]; ["PAST", "PRESENT", "FUTURE", "ETERNAL", "BEYOND"];
#[inline] #[inline]
pub fn to_index(self) -> usize { pub fn to_index(self) -> usize {

View file

@ -7,6 +7,7 @@ use crate::recognition::recognize::{ImageAnalyzer, ScoreKind};
use crate::user::{discord_it_to_discord_user, User}; use crate::user::{discord_it_to_discord_user, User};
use crate::{edit_reply, get_user, timed}; use crate::{edit_reply, get_user, timed};
use image::DynamicImage; use image::DynamicImage;
use poise::serenity_prelude::futures::future::join_all;
use poise::serenity_prelude::CreateMessage; use poise::serenity_prelude::CreateMessage;
use poise::{serenity_prelude as serenity, CreateReply}; use poise::{serenity_prelude as serenity, CreateReply};
use sqlx::query; use sqlx::query;
@ -34,7 +35,9 @@ pub async fn magic(
if files.len() == 0 { if files.len() == 0 {
ctx.reply("No images found attached to message").await?; ctx.reply("No images found attached to message").await?;
} else { return Ok(());
}
let mut embeds = Vec::with_capacity(files.len()); let mut embeds = Vec::with_capacity(files.len());
let mut attachments = Vec::with_capacity(files.len()); let mut attachments = Vec::with_capacity(files.len());
let handle = ctx let handle = ctx
@ -43,15 +46,31 @@ pub async fn magic(
let mut analyzer = ImageAnalyzer::default(); let mut analyzer = ImageAnalyzer::default();
for (i, file) in files.iter().enumerate() { // {{{ Download files
let download_tasks = files
.iter()
.filter(|file| file.dimensions().is_some())
.map(|file| async move { (file, file.download().await) });
let downloaded = timed!("dowload_files", { join_all(download_tasks).await });
if downloaded.len() < files.len() {
ctx.reply("One or more of the attached files are not images!")
.await?;
}
// }}}
for (i, (file, bytes)) in downloaded.into_iter().enumerate() {
let bytes = bytes?;
let start = Instant::now(); let start = Instant::now();
if let Some(_) = file.dimensions() { // {{{ Preapare image
let bytes = timed!("file download", { file.download().await? });
let mut image = timed!("decode image", { image::load_from_memory(&bytes)? }); let mut image = timed!("decode image", { image::load_from_memory(&bytes)? });
let mut grayscale_image = timed!("grayscale image", { let mut grayscale_image = timed!("grayscale image", {
DynamicImage::ImageLuma8(image.to_luma8()) DynamicImage::ImageLuma8(image.to_luma8())
}); });
// image = image.resize(1024, 1024, FilterType::Nearest); // image = image.resize(1024, 1024, FilterType::Nearest);
// }}}
let result: Result<(), Error> = try { let result: Result<(), Error> = try {
// {{{ Detection // {{{ Detection
@ -64,7 +83,7 @@ pub async fn magic(
// edit_reply!(ctx, handle, "Image {}: reading difficulty", i + 1).await?; // edit_reply!(ctx, handle, "Image {}: reading difficulty", i + 1).await?;
// Do not use `ocr_image` because this reads the colors // Do not use `ocr_image` because this reads the colors
let difficulty = timed!("read_difficulty", { let difficulty = timed!("read_difficulty", {
analyzer.read_difficulty(ctx.data(), &image, kind)? analyzer.read_difficulty(ctx.data(), &image, &grayscale_image, kind)?
}); });
// edit_reply!(ctx, handle, "Image {}: reading jacket", i + 1).await?; // edit_reply!(ctx, handle, "Image {}: reading jacket", i + 1).await?;
@ -72,40 +91,31 @@ pub async fn magic(
analyzer.read_jacket(ctx.data(), &mut image, kind, difficulty)? analyzer.read_jacket(ctx.data(), &mut image, kind, difficulty)?
}); });
let (note_distribution, max_recall) = match kind { let max_recall = match kind {
ScoreKind::ScoreScreen => { ScoreKind::ScoreScreen => {
edit_reply!(ctx, handle, "Image {}: reading distribution", i + 1) // edit_reply!(ctx, handle, "Image {}: reading max recall", i + 1).await?;
.await?; Some(analyzer.read_max_recall(ctx.data(), &grayscale_image)?)
let note_distribution =
Some(analyzer.read_distribution(ctx.data(), &grayscale_image)?);
edit_reply!(ctx, handle, "Image {}: reading max recall", i + 1).await?;
let max_recall =
Some(analyzer.read_max_recall(ctx.data(), &grayscale_image)?);
(note_distribution, max_recall)
} }
ScoreKind::SongSelect => (None, None), ScoreKind::SongSelect => None,
}; };
grayscale_image.invert(); grayscale_image.invert();
let note_distribution = match kind {
ScoreKind::ScoreScreen => {
// edit_reply!(ctx, handle, "Image {}: reading distribution", i + 1).await?;
Some(analyzer.read_distribution(ctx.data(), &grayscale_image)?)
}
ScoreKind::SongSelect => None,
};
// edit_reply!(ctx, handle, "Image {}: reading score", i + 1).await?; // edit_reply!(ctx, handle, "Image {}: reading score", i + 1).await?;
let score = timed!("read_score", { let score = timed!("read_score", {
analyzer.read_score( analyzer.read_score(ctx.data(), Some(chart.note_count), &grayscale_image, kind)?
ctx.data(),
Some(chart.note_count),
&grayscale_image,
kind,
)?
}); });
// {{{ Build play // {{{ Build play
let maybe_fars = Score::resolve_distibution_ambiguities( let maybe_fars =
score, Score::resolve_distibution_ambiguities(score, note_distribution, chart.note_count);
note_distribution,
chart.note_count,
);
let play = CreatePlay::new(score, &chart, &user) let play = CreatePlay::new(score, &chart, &user)
.with_attachment(file) .with_attachment(file)
@ -132,11 +142,7 @@ pub async fn magic(
.send_discord_error(ctx, &image, &file.filename, err) .send_discord_error(ctx, &image, &file.filename, err)
.await?; .await?;
} }
} else {
ctx.reply("One of the attached files is not an image!")
.await?;
continue;
}
let took = start.elapsed(); let took = start.elapsed();
edit_reply!( edit_reply!(
@ -156,7 +162,6 @@ pub async fn magic(
.send_files(ctx.http(), attachments, CreateMessage::new().embeds(embeds)) .send_files(ctx.http(), attachments, CreateMessage::new().embeds(embeds))
.await?; .await?;
} }
}
Ok(()) Ok(())
} }

View file

@ -39,7 +39,11 @@ impl UserContext {
let geosans_measurements = GEOSANS_FONT let geosans_measurements = GEOSANS_FONT
.with_borrow_mut(|font| CharMeasurements::from_text(font, "0123456789'", None))?; .with_borrow_mut(|font| CharMeasurements::from_text(font, "0123456789'", None))?;
let exo_measurements = EXO_FONT.with_borrow_mut(|font| { let exo_measurements = EXO_FONT.with_borrow_mut(|font| {
CharMeasurements::from_text(font, "0123456789'abcdefghijklmnopqrstuvwxyz", Some(700)) CharMeasurements::from_text(
font,
"0123456789'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
Some(700),
)
})?; })?;
println!("Created user context"); println!("Created user context");

View file

@ -6,10 +6,9 @@
//! allows for a convenient way to throw images into a `logs` directory with //! allows for a convenient way to throw images into a `logs` directory with
//! a simple env var. //! a simple env var.
use std::{env, ops::Deref}; use std::{env, ops::Deref, sync::OnceLock, time::Instant};
use image::{DynamicImage, EncodableLayout, ImageBuffer, PixelWithColorType}; use image::{DynamicImage, EncodableLayout, ImageBuffer, PixelWithColorType};
use poise::serenity_prelude::Timestamp;
use crate::context::Error; use crate::context::Error;
@ -20,10 +19,19 @@ fn should_save_debug_images() -> bool {
.unwrap_or(false) .unwrap_or(false)
} }
/// Returns the reference instant used to name debug images.
///
/// The instant is captured lazily: the first call stores `Instant::now()` in a
/// process-wide `OnceLock`, and every later call returns a copy of that same
/// value. Note that this is the time of the *first call*, not necessarily the
/// moment the process started.
#[inline]
fn get_startup_time() -> Instant {
    static FIRST_CALL: OnceLock<Instant> = OnceLock::new();

    // `Instant` is `Copy`, so dereferencing hands back the stored value itself.
    let recorded: &Instant = FIRST_CALL.get_or_init(Instant::now);
    *recorded
}
#[inline] #[inline]
pub fn debug_image_log(image: &DynamicImage) -> Result<(), Error> { pub fn debug_image_log(image: &DynamicImage) -> Result<(), Error> {
if should_save_debug_images() { if should_save_debug_images() {
image.save(format!("./logs/{}.png", Timestamp::now()))?; image.save(format!(
"./logs/{:0>15}.png",
get_startup_time().elapsed().as_nanos()
))?;
} }
Ok(()) Ok(())
@ -37,7 +45,10 @@ where
C: Deref<Target = [P::Subpixel]>, C: Deref<Target = [P::Subpixel]>,
{ {
if should_save_debug_images() { if should_save_debug_images() {
image.save(format!("./logs/{}.png", Timestamp::now()))?; image.save(format!(
"./logs/{:0>15}.png",
get_startup_time().elapsed().as_nanos()
))?;
} }
Ok(()) Ok(())

View file

@ -1,4 +1,4 @@
//! Hyperglass my own specialized OCR system, created as a result of my //! Hyperglass is my own specialized OCR system, created as a result of my
//! annoyance with how unreliable tesseract is. Assuming we know the font, //! annoyance with how unreliable tesseract is. Assuming we know the font,
//! OCR should be almost perfect, even when faced with strange kerning. This is //! OCR should be almost perfect, even when faced with strange kerning. This is
//! what this module achieves! //! what this module achieves!
@ -158,8 +158,12 @@ struct ComponentsWithBounds {
} }
impl ComponentsWithBounds { impl ComponentsWithBounds {
fn from_image(image: &DynamicImage) -> Result<Self, Error> { fn from_image(image: &DynamicImage, binarisation_threshold: u8) -> Result<Self, Error> {
let image = threshold(&image.to_luma8(), 100, ThresholdType::Binary); let image = threshold(
&image.to_luma8(),
binarisation_threshold,
ThresholdType::Binary,
);
debug_image_buffer_log(&image)?; debug_image_buffer_log(&image)?;
let background = Luma([u8::MAX]); let background = Luma([u8::MAX]);
@ -168,7 +172,7 @@ impl ComponentsWithBounds {
let mut bounds: Vec<Option<ComponentBounds>> = Vec::new(); let mut bounds: Vec<Option<ComponentBounds>> = Vec::new();
for x in 0..components.width() { for x in 0..components.width() {
for y in 0..components.height() { for y in 0..components.height() {
// {{{ Retrieve pixel if it's not backround // {{{ Retrieve pixel if it's not background
let component = components[(x, y)].0[0]; let component = components[(x, y)].0[0];
if component == 0 { if component == 0 {
continue; continue;
@ -254,7 +258,7 @@ impl CharMeasurements {
debug_image_log(&image)?; debug_image_log(&image)?;
let components = ComponentsWithBounds::from_image(&image)?; let components = ComponentsWithBounds::from_image(&image, 100)?;
// {{{ Compute max width/height // {{{ Compute max width/height
let max_width = components let max_width = components
@ -293,9 +297,16 @@ impl CharMeasurements {
} }
// }}} // }}}
// {{{ Recognition // {{{ Recognition
pub fn recognise(&self, image: &DynamicImage, whitelist: &str) -> Result<String, Error> { pub fn recognise(
let components = timed!("from_image", { ComponentsWithBounds::from_image(image)? }); &self,
let mut result = String::new(); image: &DynamicImage,
whitelist: &str,
binarisation_threshold: Option<u8>,
) -> Result<String, Error> {
let components = timed!("from_image", {
ComponentsWithBounds::from_image(image, binarisation_threshold.unwrap_or(100))?
});
let mut result = String::with_capacity(components.bounds.len());
let max_height = components let max_height = components
.bounds .bounds

View file

@ -1,5 +1,4 @@
use std::fmt::Display; use std::fmt::Display;
use std::str::FromStr;
use hypertesseract::{PageSegMode, Tesseract}; use hypertesseract::{PageSegMode, Tesseract};
use image::imageops::FilterType; use image::imageops::FilterType;
@ -154,7 +153,7 @@ impl ImageAnalyzer {
let result = timed!("full recognition", { let result = timed!("full recognition", {
Score( Score(
measurements measurements
.recognise(&image, "0123456789'")? .recognise(&image, "0123456789'", None)?
.chars() .chars()
.filter(|c| *c != '\'') .filter(|c| *c != '\'')
.collect::<String>() .collect::<String>()
@ -182,6 +181,7 @@ impl ImageAnalyzer {
&mut self, &mut self,
ctx: &UserContext, ctx: &UserContext,
image: &DynamicImage, image: &DynamicImage,
grayscale_image: &DynamicImage,
kind: ScoreKind, kind: ScoreKind,
) -> Result<Difficulty, Error> { ) -> Result<Difficulty, Error> {
if kind == ScoreKind::SongSelect { if kind == ScoreKind::SongSelect {
@ -202,10 +202,6 @@ impl ImageAnalyzer {
) )
.unwrap(); .unwrap();
// rect.width = 100;
// rect.height = 100;
// self.crop_image_to_bytes(image, rect).unwrap();
let image_color = image.get_pixel(rect.x as u32, rect.y as u32); let image_color = image.get_pixel(rect.x as u32, rect.y as u32);
let image_color = Color::from_bytes(image_color.0); let image_color = Color::from_bytes(image_color.0);
@ -217,25 +213,15 @@ impl ImageAnalyzer {
return Ok(min.1); return Ok(min.1);
} }
let (text, conf) = Tesseract::builder() let image = self.interp_crop(
.language(hypertesseract::Language::English) ctx,
.page_seg_mode(PageSegMode::RawLine) grayscale_image,
.build()? ScoreScreen(ScoreScreenRect::Difficulty),
.recognize_text_cloned_with_conf(
&self
.interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::Difficulty))?
.into_rgba8(),
)?; )?;
let text = text.trim().to_lowercase(); let text =
ctx.exo_measurements
if conf < 10 && conf != 0 { .recognise(&image, "PASTPRESENTFUTUREETERNALBEYOND", None)?;
return Err(format!(
"Difficulty text is not readable (confidence = {}, text = {}).",
conf, text
)
.into());
}
let difficulty = Difficulty::DIFFICULTIES let difficulty = Difficulty::DIFFICULTIES
.iter() .iter()
@ -256,7 +242,7 @@ impl ImageAnalyzer {
let image = self.interp_crop(ctx, image, PlayKind)?; let image = self.interp_crop(ctx, image, PlayKind)?;
let text = ctx let text = ctx
.exo_measurements .exo_measurements
.recognise(&image, "resultselectasong")?; .recognise(&image, "resultselectasong", None)?;
let result = if edit_distance(&text, "Result") < edit_distance(&text, "Select a song") { let result = if edit_distance(&text, "Result") < edit_distance(&text, "Select a song") {
ScoreKind::ScoreScreen ScoreKind::ScoreScreen
@ -356,21 +342,13 @@ impl ImageAnalyzer {
static KINDS: [ScoreScreenRect; 3] = [Pure, Far, Lost]; static KINDS: [ScoreScreenRect; 3] = [Pure, Far, Lost];
for i in 0..3 { for i in 0..3 {
let text = Tesseract::builder() let image = self.interp_crop(ctx, image, ScoreScreen(KINDS[i]))?;
.language(hypertesseract::Language::English) out[i] = ctx
.page_seg_mode(PageSegMode::SparseText) .exo_measurements
.whitelist_str("0123456789")? .recognise(&image, "0123456789", Some(30))?
.assume_numeric_input() .parse()?;
.build()?
.recognize_text_cloned(
&self
.interp_crop(ctx, image, ScoreScreen(KINDS[i]))?
.into_rgba8(),
)?;
println!("Raw '{}'", text.trim());
out[i] = u32::from_str(&text.trim()).unwrap_or(0);
} }
println!("Ditribution {out:?}"); println!("Ditribution {out:?}");
Ok((out[0], out[1], out[2])) Ok((out[0], out[1], out[2]))
@ -382,28 +360,11 @@ impl ImageAnalyzer {
ctx: &'a UserContext, ctx: &'a UserContext,
image: &DynamicImage, image: &DynamicImage,
) -> Result<u32, Error> { ) -> Result<u32, Error> {
let (text, conf) = Tesseract::builder() let image = self.interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::MaxRecall))?;
.language(hypertesseract::Language::English) let max_recall = ctx
.page_seg_mode(PageSegMode::SingleLine) .exo_measurements
.whitelist_str("0123456789")? .recognise(&image, "0123456789", None)?
.assume_numeric_input() .parse()?;
.build()?
.recognize_text_cloned_with_conf(
&self
.interp_crop(ctx, image, ScoreScreen(ScoreScreenRect::MaxRecall))?
.into_rgba8(),
)?;
let max_recall = u32::from_str_radix(text.trim(), 10)?;
if conf < 20 && conf != 0 {
return Err(format!(
"Title text is not readable (confidence = {}, text = {}).",
conf,
text.trim()
)
.into());
}
Ok(max_recall) Ok(max_recall)
} }