Epub generation looks finished.

Thinking of making it use CSS files from RoyalRoad.
pull/1/head
NA 2024-01-26 18:55:14 +00:00
parent d432b60541
commit d448b3ba42
3 changed files with 158 additions and 11 deletions

View File

@ -1,7 +1,8 @@
use std::{collections::HashMap, process::exit};
use std::{collections::HashMap, io::Write, process::{exit, Command, Stdio}};
use regex::Regex;
use scraper::{Html, Selector};
use tempdir::TempDir;
use url::Url;
use crate::misc::HashMapExt;
@ -188,4 +189,62 @@ pub fn extract_urls_and_img_tag(chapter_html: &Html) -> HashMap<Url, Vec<String>
}
return chapter_image_urls;
}
/// Replace the image tag with a new one that contains the new src attribute.
///
/// If the tag has no `src` attribute it is returned unchanged (re-serialized).
/// Panics if the fragment contains no `<img>` element at all, matching the
/// original behavior callers rely on.
pub fn replace_img_src(img_tag: String, new_src: String) -> String {
    let img_tag = string_to_html_fragment(&img_tag);
    let selector = Selector::parse("img").unwrap();
    let element = img_tag.select(&selector).next().unwrap();
    if element.attr("src").is_some() {
        let image_tag = element.html();
        // [^"']* instead of the greedy .* — a greedy match on a tag such as
        // `<img src="a.png" alt="x">` would capture from `src=` through the
        // LAST quote in the tag, and the replace below would then wipe out
        // every attribute after src.
        let src_match_regex = Regex::new(r#"src=["'][^"']*["']"#).unwrap();
        let src_attr = src_match_regex
            .find(&image_tag)
            .expect("an img tag with a src attribute must match the src regex")
            .as_str();
        image_tag.replace(src_attr, &format!(r#"src="{new_src}""#))
    } else {
        element.html()
    }
}
/// Convert a given html dom into xhtml by piping it through the bundled
/// html2xhtml executable located in `html2xhtml_dir`.
///
/// Exits the whole program if html2xhtml can not be started.
pub fn html_to_xhtml(html: Html, html2xhtml_dir: &TempDir) -> String {
    // Windows ships an .exe; every other target uses the extensionless name,
    // so a single not(windows) cfg covers linux, macos, and any other unix
    // (the previous linux+macos pair left this const undefined elsewhere).
    #[cfg(target_os = "windows")]
    const HTML2XHTML_ENTRY: &str = "html2xhtml.exe";
    #[cfg(not(target_os = "windows"))]
    const HTML2XHTML_ENTRY: &str = "html2xhtml";

    // Remove nbsp entities; they can cause certain e-readers to crash.
    let html = html.html().replace("&nbsp;", " ");

    // Start html2xhtml with piped stdin/stdout.
    let mut html2xhtml = match Command::new(html2xhtml_dir.path().join(HTML2XHTML_ENTRY))
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()
    {
        Ok(child) => child,
        Err(error) => {
            eprintln!("Error! Unable to start html2xhtml: {error}");
            exit(1);
        },
    };

    // Write the html to stdin, then drop the handle so the child sees EOF
    // before we block waiting for its output.
    {
        let mut stdin = html2xhtml.stdin.take().expect("stdin was configured as piped");
        stdin.write_all(html.as_bytes()).unwrap();
    }
    let html2xhtml_output = html2xhtml.wait_with_output().unwrap();

    // Generate a lossy string from the stdout; lossy conversion can never
    // panic on stray non-utf8 bytes.
    String::from_utf8_lossy(&html2xhtml_output.stdout).to_string()
}

View File

@ -1,4 +1,4 @@
use std::process::exit;
use std::{collections::HashMap, process::exit};
use reqwest::{blocking::Response, header::HeaderMap};
use url::Url;
@ -36,6 +36,34 @@ impl HttpResponse {
}
}
}
/// Attempt to get the content(mime)-type and file extension from the http-header.
///
/// If the content-type header is absent or its value can not be read as a
/// string, it will warn the user and return empty strings. If the mime-type
/// is recognized but has no known extension, the extension is empty.
pub fn get_content_type_and_file_extension(&self) -> (String, String) {
    // A hashmap to convert mime-types to file extensions.
    let mime_to_file_extension: HashMap<&str, &str> = HashMap::from([
        ("image/png", "png"),
        ("image/webp", "webp"),
        ("image/jpeg", "jpeg"),
        ("image/jpg", "jpg"),
    ]);

    // Look the header up with get() instead of indexing: indexing a missing
    // key panics, which contradicted the documented warn-and-return behavior.
    let header_value = match self.get_headers().get("content-type") {
        Some(header_value) => header_value,
        None => {
            eprintln!("Warning! Unable to get content type from the http-header: header is missing");
            return (String::with_capacity(0), String::with_capacity(0));
        }
    };
    let content_type = match header_value.to_str() {
        Ok(content_type) => content_type,
        Err(warning) => {
            eprintln!("Warning! Unable to get content type from the http-header: {warning}");
            return (String::with_capacity(0), String::with_capacity(0));
        }
    };

    // Single get() lookup instead of contains_key() followed by indexing.
    match mime_to_file_extension.get(content_type) {
        Some(extension) => (content_type.to_string(), extension.to_string()),
        None => (content_type.to_string(), String::with_capacity(0)),
    }
}
}
/// Get an http response for a given url. Exits the program if it fails.

View File

@ -1,9 +1,12 @@
use std::{fs::OpenOptions, io::Write, path::PathBuf, process::exit};
use std::{collections::HashMap, fs::OpenOptions, io::Write, path::PathBuf, process::exit};
use bytes::Buf;
use chrono::prelude::Local;
use clap::Args;
use epub_builder::{EpubBuilder, EpubContent, ReferenceType, ZipLibrary};
use file_system_crap::convert_path_to_os_specific;
use html::{html_to_xhtml, remove_image_tags, string_to_html_fragment};
use indicatif::{ProgressBar, ProgressStyle};
use url::Url;
mod book;
@ -80,18 +83,23 @@ pub fn generate_epub(epub_args: EpubArgs, book_url: Url, output_directory: PathB
.expect("Unable to add title metadata");
// Download the cover image & add it to the epub.
let cover_image = http::get_response(book.cover_image_url).get_bytes().to_vec();
epub_builder.add_cover_image("cover.jpeg", cover_image.as_slice(), "image/jpeg").expect("Unable to add cover image.");
let cover_image = http::get_response(book.cover_image_url);
let (cover_mime_type, cover_file_extension) = cover_image.get_content_type_and_file_extension();
epub_builder.add_cover_image(
format!("cover.{cover_file_extension}"),
cover_image.get_bytes().to_vec().as_slice(),
cover_mime_type).expect("Error! Unable to add cover image.");
// Generate the cover xhtml.
let cover_xhtml = format!(
r#"<head></head><body><div style="text-align: center;">
<h1><a href="{0}">{1}</a></h1>
<img src="cover.jpeg"/>
<h2>by: {2}</h2>
<h3>Archived on: {3}</h3></div></body>"#,
<img src="cover.{2}"/>
<h2>by: {3}</h2>
<h3>Archived on: {4}</h3></div></body>"#,
book.book_url,
book.title,
cover_file_extension,
book.author,
chrono::Local::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, false)
);
@ -102,7 +110,7 @@ pub fn generate_epub(epub_args: EpubArgs, book_url: Url, output_directory: PathB
EpubContent::new("title.xhtml", cover_xhtml.as_bytes())
.title("Cover")
.reftype(ReferenceType::Cover),
).expect("Unable to add cover");
).expect("Error! Unable to add cover");
// Add a table of contents after the cover page.
epub_builder.inline_toc();
@ -110,8 +118,60 @@ pub fn generate_epub(epub_args: EpubArgs, book_url: Url, output_directory: PathB
// Setup html2xhtml on the operating system.
let html2xhtml_dir = file_system_crap::setup_html2xhtml();
// TODO! Generate the epub body, deal with images etc etc. You know pickup from last night etc etc.
// Finish setup_html2xhtml() first though dummy.
let mut old_tags_new_tags: HashMap<String, String> = HashMap::new();
if !epub_args.no_images {
// Download the images and add em to the epub.
println!("\nDownloading and processing images:");
// Spawn a progress bar showing how many images have been downloaded & processed.
let progress_bar = ProgressBar::new(book.image_urls_and_tags.keys().len().try_into().unwrap());
progress_bar.set_style(
ProgressStyle::with_template("[{elapsed_precise}] [{wide_bar:.cyan/blue}] {percent}% ")
.unwrap()
.progress_chars("#>-"),
);
let mut i: usize = 0;
for image_url in book.image_urls_and_tags.keys() {
let image = http::get_response(image_url.clone());
let (image_mime_type, image_file_extension) = image.get_content_type_and_file_extension();
epub_builder.add_resource(
format!("image_{i}.{image_file_extension}"),
image.get_bytes().to_vec().reader(),
image_mime_type).expect("Error! Unable to add content image");
for image_tag in book.image_urls_and_tags[image_url].clone() {
old_tags_new_tags.insert(image_tag.clone(), html::replace_img_src(image_tag, format!("image_{i}.{image_file_extension}")));
}
i+=1;
progress_bar.inc(1);
}
progress_bar.finish();
}
// Convert the html to xhtml and add the xhtml to the epub for each chapter.
for (i, chapter) in book.chapters.iter().enumerate() {
let xhtml: String;
if epub_args.no_images {
xhtml = html_to_xhtml(string_to_html_fragment(&remove_image_tags(&chapter.isolated_chapter_html)), &html2xhtml_dir)
}
else {
let mut replaced_html = chapter.isolated_chapter_html.html();
for old_img_tag in old_tags_new_tags.keys() {
replaced_html = replaced_html.replace(&old_img_tag.clone(), &old_tags_new_tags[old_img_tag]);
}
xhtml = html_to_xhtml(string_to_html_fragment(&replaced_html), &html2xhtml_dir);
}
epub_builder.add_content(EpubContent::new(format!("chapter_{}.xhtml", i+1), xhtml.as_bytes())
.title(chapter.chapter_name.clone())
.reftype(ReferenceType::Text)).expect("Error! Unable to add chapter");
}
// Generate the finished epub data as a byte vector.
let mut finished_epub: Vec<u8> = vec![];