mirror of
https://github.com/Raine-gay/royal_road_archiver.git
synced 2024-11-27 01:18:41 -06:00
Working on epub generation. Got to work on setup_html2xhtml() next.
This commit is contained in:
parent
71763b7f07
commit
0bfcfd496f
5
.vscode/settings.json
vendored
5
.vscode/settings.json
vendored
|
@ -2,16 +2,21 @@
|
|||
"cSpell.words": [
|
||||
"archiver",
|
||||
"Audiobook",
|
||||
"dogshit",
|
||||
"epub",
|
||||
"stylesheet",
|
||||
"Webnovel"
|
||||
],
|
||||
"rust-analyzer.showUnlinkedFileNotification": false,
|
||||
"cSpell.ignoreWords": [
|
||||
"TMPDIR",
|
||||
"autotools",
|
||||
"chrono",
|
||||
"indicatif",
|
||||
"reftype",
|
||||
"reqwest",
|
||||
"royalroad",
|
||||
"tempdir",
|
||||
"ureq"
|
||||
]
|
||||
}
|
140
Cargo.lock
generated
140
Cargo.lock
generated
|
@ -279,6 +279,21 @@ version = "0.8.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.31.2"
|
||||
|
@ -302,6 +317,15 @@ dependencies = [
|
|||
"syn 2.0.48",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.3.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.17"
|
||||
|
@ -349,6 +373,23 @@ dependencies = [
|
|||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "epub-builder"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6fcc8fc7b93c7001e0d47c269aa5a30a78a1f44692dc09cc9d0f781378545e1"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"eyre",
|
||||
"html-escape",
|
||||
"log",
|
||||
"once_cell",
|
||||
"tempfile",
|
||||
"upon",
|
||||
"uuid",
|
||||
"zip",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.1"
|
||||
|
@ -365,12 +406,32 @@ dependencies = [
|
|||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "eyre"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6267a1fa6f59179ea4afc8e50fd8612a3cc60bc858f786ff877a4a8cb042799"
|
||||
dependencies = [
|
||||
"indenter",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
|
@ -531,6 +592,15 @@ version = "0.3.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f"
|
||||
|
||||
[[package]]
|
||||
name = "html-escape"
|
||||
version = "0.2.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
|
||||
dependencies = [
|
||||
"utf8-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html2md"
|
||||
version = "0.2.14"
|
||||
|
@ -663,6 +733,12 @@ dependencies = [
|
|||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indenter"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683"
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.1.0"
|
||||
|
@ -1081,6 +1157,12 @@ version = "1.6.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0"
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
|
@ -1224,6 +1306,7 @@ dependencies = [
|
|||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"epub-builder",
|
||||
"html2md",
|
||||
"indicatif",
|
||||
"path-slash",
|
||||
|
@ -1549,6 +1632,24 @@ dependencies = [
|
|||
"syn 2.0.48",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"powerfmt",
|
||||
"serde",
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.6.0"
|
||||
|
@ -1662,6 +1763,17 @@ version = "0.1.11"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
|
||||
|
||||
[[package]]
|
||||
name = "upon"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21a9260fe394dfd8ab204a8eab40f88eb9a331bb852147d24fc0aff6b30daa02"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"unicode-ident",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.0"
|
||||
|
@ -1679,12 +1791,27 @@ version = "0.7.6"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utf8-width"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
|
@ -2010,3 +2137,16 @@ dependencies = [
|
|||
"quote",
|
||||
"syn 2.0.48",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "0.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"crc32fast",
|
||||
"crossbeam-utils",
|
||||
"flate2",
|
||||
"time",
|
||||
]
|
||||
|
|
|
@ -17,6 +17,7 @@ path = "src/binary.rs"
|
|||
bytes = "1.5.0"
|
||||
chrono = "0.4.33"
|
||||
clap = { version = "4.4.18", features = ["derive"] }
|
||||
epub-builder = "0.7.4"
|
||||
html2md = "0.2.14"
|
||||
indicatif = "0.17.7"
|
||||
path-slash = "0.2.1"
|
||||
|
|
|
@ -43,10 +43,10 @@ fn main() {
|
|||
let output_directory: PathBuf;
|
||||
match cli_input.output_directory {
|
||||
Some(output_directory_input) => {
|
||||
output_directory = Path::new(&output_directory_input).to_path_buf();
|
||||
output_directory = PathBuf::from(&output_directory_input);
|
||||
},
|
||||
None => {
|
||||
output_directory = env::current_dir().unwrap().as_path().to_path_buf();
|
||||
output_directory = env::current_dir().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -61,7 +61,9 @@ fn main() {
|
|||
}
|
||||
}
|
||||
|
||||
// Check if the directory exists and is writeable. Creates one if not.
|
||||
/// Check if the directory exists and is writeable. Creates one if not.
|
||||
///
|
||||
/// Exits the program on failure.
|
||||
fn valid_directory_check(output_directory: &Path) {
|
||||
// Check if the directory exists, if it does not; attempt to create one.
|
||||
if !output_directory.exists() {
|
||||
|
@ -95,7 +97,7 @@ fn valid_url_check(book_url: &str) -> Url {
|
|||
}
|
||||
},
|
||||
Err(error) => {
|
||||
eprintln!("Error! Unable to parse url: {error}");
|
||||
eprintln!("Error! Unable to parse url: {book_url}\n{error}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
|
38
src/book.rs
38
src/book.rs
|
@ -1,24 +1,29 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use crate::misc::HashMapExt;
|
||||
use scraper::Html;
|
||||
use url::Url;
|
||||
|
||||
use crate::{html, http};
|
||||
use crate::{file_system_crap::remove_illegal_chars, html, http};
|
||||
|
||||
/// A struct representing a book & all the needed data to generate one.
|
||||
pub struct Book {
|
||||
/// The RoyalRoad Url for the book.
|
||||
book_url: Url,
|
||||
pub book_url: Url,
|
||||
|
||||
/// The book's title.
|
||||
pub title: String,
|
||||
|
||||
/// Book title used for the filename.
|
||||
/// Should have illegal chars expunged via file_system_crap::remove_illegal_chars.
|
||||
pub file_name_title: String,
|
||||
|
||||
/// The book's author.
|
||||
pub author: String,
|
||||
|
||||
/// A Url to the book's cover image.
|
||||
cover_image_url: Url,
|
||||
pub cover_image_url: Url,
|
||||
|
||||
/// The raw html data of the RoyalRoad index page.
|
||||
index_html: Html,
|
||||
|
@ -27,7 +32,7 @@ pub struct Book {
|
|||
pub chapters: Vec<Chapter>,
|
||||
|
||||
/// A hashmap representing the book image urls and their corresponding img html tags.
|
||||
image_urls: HashMap<Url, Vec<String>>,
|
||||
pub image_urls_and_tags: HashMap<Url, Vec<String>>,
|
||||
}
|
||||
|
||||
impl Book {
|
||||
|
@ -36,10 +41,9 @@ impl Book {
|
|||
let index_html = html::string_to_html_document(&http::get_response(book_url.clone()).get_text());
|
||||
|
||||
let chapter_names_and_urls = html::get_chapter_names_and_urls_from_index(&index_html);
|
||||
|
||||
let mut chapters: Vec<Chapter> = Vec::with_capacity(chapter_names_and_urls.len());
|
||||
|
||||
let mut image_urls: HashMap<Url, Vec<String>> = HashMap::new();
|
||||
let mut image_urls_and_tags: HashMap<Url, Vec<String>> = HashMap::new();
|
||||
|
||||
println!("\nDownloading and processing chapters:");
|
||||
// Spawn a progress bar showing how many chapters have been downloaded & processed.
|
||||
|
@ -54,8 +58,8 @@ impl Book {
|
|||
for i in 0..chapter_names_and_urls.len() {
|
||||
let chapter = Chapter::new(&chapter_names_and_urls[i][0], &chapter_names_and_urls[i][1]);
|
||||
|
||||
// extract the image urls and add em to the image_urls hashmap.
|
||||
|
||||
// extract the image urls and add em to the image_urls_and_tags hashmap.
|
||||
image_urls_and_tags = image_urls_and_tags.join(html::extract_urls_and_img_tag(&chapter.isolated_chapter_html));
|
||||
|
||||
chapters.push(chapter);
|
||||
|
||||
|
@ -64,14 +68,17 @@ impl Book {
|
|||
|
||||
progress_bar.finish();
|
||||
|
||||
let title = html::get_title_from_index(&index_html);
|
||||
|
||||
Book {
|
||||
book_url: book_url,
|
||||
title: html::get_title_from_index(&index_html),
|
||||
title: title.clone(),
|
||||
file_name_title: remove_illegal_chars(title),
|
||||
author: html::get_author_from_index(&index_html),
|
||||
cover_image_url: http::string_to_url(&html::get_cover_image_url_from_index(&index_html)),
|
||||
index_html: index_html,
|
||||
chapters: chapters,
|
||||
image_urls: image_urls,
|
||||
image_urls_and_tags,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,7 +97,7 @@ pub struct Chapter {
|
|||
/// The name of the chapter.
|
||||
pub chapter_name: String,
|
||||
|
||||
/// The raw html data of the page.
|
||||
/// The raw html data of the chapter page.
|
||||
raw_chapter_html: Html,
|
||||
|
||||
/// The isolated chapter html.
|
||||
|
@ -105,13 +112,8 @@ impl Chapter {
|
|||
Chapter {
|
||||
chapter_url: chapter_url,
|
||||
chapter_name: chapter_name.to_string(),
|
||||
raw_chapter_html: raw_chapter_html.clone(),
|
||||
isolated_chapter_html: html::isolate_chapter_content(raw_chapter_html)
|
||||
isolated_chapter_html: html::isolate_chapter_content(&raw_chapter_html),
|
||||
raw_chapter_html: raw_chapter_html,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO!
|
||||
struct BookCss {
|
||||
|
||||
}
|
110
src/constants.rs
Normal file
110
src/constants.rs
Normal file
|
@ -0,0 +1,110 @@
|
|||
pub const EPUB_XML_HEAD: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">"#;
|
||||
|
||||
pub const EPUB_XML_TAIL: &str = r#"</html>"#;
|
||||
|
||||
pub const EPUB_CSS: &str = r#"
|
||||
/*
|
||||
html5doctor.com Reset Stylesheet
|
||||
v1.6.1
|
||||
Last Updated: 2010-09-17
|
||||
Author: Richard Clark - http://richclarkdesign.com
|
||||
Twitter: @rich_clark
|
||||
*/
|
||||
|
||||
html, body, div, span, object, iframe,
|
||||
h1, h2, h3, h4, h5, h6, p, blockquote, pre,
|
||||
abbr, address, cite, code,
|
||||
del, dfn, em, img, ins, kbd, q, samp,
|
||||
small, strong, sub, sup, var,
|
||||
b, i,
|
||||
dl, dt, dd, ol, ul, li,
|
||||
fieldset, form, label, legend,
|
||||
table, caption, tbody, tfoot, thead, tr, th, td,
|
||||
article, aside, canvas, details, figcaption, figure,
|
||||
footer, header, hgroup, menu, nav, section, summary,
|
||||
time, mark, audio, video {
|
||||
margin:0;
|
||||
padding:0;
|
||||
border:0;
|
||||
outline:0;
|
||||
font-size:100%;
|
||||
vertical-align:baseline;
|
||||
background:transparent;
|
||||
}
|
||||
|
||||
body {
|
||||
line-height:1;
|
||||
}
|
||||
|
||||
article,aside,details,figcaption,figure,
|
||||
footer,header,hgroup,menu,nav,section {
|
||||
display:block;
|
||||
}
|
||||
|
||||
nav ul {
|
||||
list-style:none;
|
||||
}
|
||||
|
||||
blockquote, q {
|
||||
quotes:none;
|
||||
}
|
||||
|
||||
blockquote:before, blockquote:after,
|
||||
q:before, q:after {
|
||||
content:'';
|
||||
content:none;
|
||||
}
|
||||
|
||||
a {
|
||||
margin:0;
|
||||
padding:0;
|
||||
font-size:100%;
|
||||
vertical-align:baseline;
|
||||
background:transparent;
|
||||
}
|
||||
|
||||
/* change colours to suit your needs */
|
||||
ins {
|
||||
background-color:#ff9;
|
||||
color:#000;
|
||||
text-decoration:none;
|
||||
}
|
||||
|
||||
/* change colours to suit your needs */
|
||||
mark {
|
||||
background-color:#ff9;
|
||||
color:#000;
|
||||
font-style:italic;
|
||||
font-weight:bold;
|
||||
}
|
||||
|
||||
del {
|
||||
text-decoration: line-through;
|
||||
}
|
||||
|
||||
abbr[title], dfn[title] {
|
||||
border-bottom:1px dotted;
|
||||
cursor:help;
|
||||
}
|
||||
|
||||
table {
|
||||
border-collapse:collapse;
|
||||
border-spacing:0;
|
||||
}
|
||||
|
||||
/* change border colour to suit your needs */
|
||||
hr {
|
||||
display:block;
|
||||
height:1px;
|
||||
border:0;
|
||||
border-top:1px solid #cccccc;
|
||||
margin:1em 0;
|
||||
padding:0;
|
||||
}
|
||||
|
||||
input, select {
|
||||
vertical-align:middle;
|
||||
}
|
||||
"#;
|
50
src/file_system_crap.rs
Normal file
50
src/file_system_crap.rs
Normal file
|
@ -0,0 +1,50 @@
|
|||
use std::path::PathBuf;
|
||||
|
||||
use path_slash::PathBufExt as _;
|
||||
|
||||
/// Converts a given path to windows style if needed.
|
||||
pub fn convert_path_to_os_specific(path: PathBuf) -> PathBuf {
|
||||
// If target os is windows.
|
||||
#[cfg(target_os = "windows")] {
|
||||
return PathBuf::from_slash_lossy(path.into_os_string());
|
||||
}
|
||||
|
||||
// If target os is not windows.
|
||||
#[cfg(not(target_os = "windows"))] {
|
||||
return PathBuf::from_backslash_lossy(path.into_os_string());
|
||||
}
|
||||
}
|
||||
|
||||
/// Replace chars that are illegal to be used in filenames on both unix & windows.
///
/// Every offending char is swapped for a single space, so the returned string
/// has the same number of chars as the input.
///
/// Replaced chars:
/// * `/ \ < > : " | ? *`
/// * ASCII control chars `U+0000..=U+001F` (NUL can not appear in a filename on
///   unix or windows, and windows rejects the rest of that range as well).
pub fn remove_illegal_chars(string: String) -> String {
    const ILLEGAL_CHARS: [char; 9] = ['/', '\u{005C}', '<', '>', ':', '\u{0022}', '|', '?', '*'];

    // One pass over the input rather than one `replace` (and one fresh allocation) per illegal char.
    string
        .chars()
        .map(|character| {
            let is_illegal = ILLEGAL_CHARS.contains(&character)
                || matches!(character, '\u{0000}'..='\u{001F}');

            if is_illegal { ' ' } else { character }
        })
        .collect()
}
|
||||
|
||||
/// Setup html2xhtml in the operating system's temp directory.
|
||||
pub fn setup_html2xhtml() {
|
||||
#[cfg(target_os = "windows")] {
|
||||
//TODO!
|
||||
// Thinking of using C:\Users\<username>\AppData\Local\Temp\html2xhtml-windows
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")] {
|
||||
// TODO!
|
||||
// Thinking of using /tmp/html2xhtml-linux
|
||||
}
|
||||
|
||||
#[cfg(target_os = "macos")] {
|
||||
// TODO!
|
||||
// You can find the macos tempdir by doing: echo $TMPDIR
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete html2xhtml from the operating system's temp directory.
|
||||
pub fn delete_html2xhtml() {
|
||||
// TODO!
|
||||
}
|
27
src/html.rs
27
src/html.rs
|
@ -4,6 +4,8 @@ use regex::Regex;
|
|||
use scraper::{Html, Selector};
|
||||
use url::Url;
|
||||
|
||||
use crate::misc::HashMapExt;
|
||||
|
||||
/// Convert a string to an html document.
|
||||
pub fn string_to_html_document(document_string: &str) -> Html {
|
||||
Html::parse_document(document_string)
|
||||
|
@ -123,7 +125,7 @@ pub fn get_chapter_names_and_urls_from_index(index_html: &Html) -> Vec<[String;
|
|||
}
|
||||
|
||||
/// Isolate chapter content from the rest of the shit on the page.
|
||||
pub fn isolate_chapter_content(raw_chapter_html: Html) -> Html {
|
||||
pub fn isolate_chapter_content(raw_chapter_html: &Html) -> Html {
|
||||
let page_html = Html::parse_document(&raw_chapter_html.html());
|
||||
|
||||
let selector = Selector::parse("div").unwrap();
|
||||
|
@ -142,7 +144,7 @@ pub fn isolate_chapter_content(raw_chapter_html: Html) -> Html {
|
|||
}
|
||||
|
||||
/// Remove all img tags from the html fragment.
|
||||
pub fn remove_image_tags(html_fragment: Html) -> String {
|
||||
pub fn remove_image_tags(html_fragment: &Html) -> String {
|
||||
let mut image_tags: Vec<String> = Vec::new();
|
||||
|
||||
let selector = Selector::parse("img").unwrap();
|
||||
|
@ -161,10 +163,29 @@ pub fn remove_image_tags(html_fragment: Html) -> String {
|
|||
return html_fragment;
|
||||
}
|
||||
|
||||
pub fn extract_urls_and_imgs_tag(chapter_html: Html) -> HashMap<Url, Vec<String>> {
|
||||
/// Extract the urls and image tags from a chapter and put them in the hashmap:
|
||||
/// ``Hashmap<Url, Vec<String>>``
|
||||
pub fn extract_urls_and_img_tag(chapter_html: &Html) -> HashMap<Url, Vec<String>> {
|
||||
let mut chapter_image_urls: HashMap<Url, Vec<String>> = HashMap::new();
|
||||
|
||||
let selector = Selector::parse("img").unwrap();
|
||||
for element in chapter_html.select(&selector) {
|
||||
let url = element.attr("src");
|
||||
let image_tag = element.html();
|
||||
|
||||
if url.is_none() { continue; }
|
||||
let url = match Url::parse(url.unwrap()) {
|
||||
Ok(url) => url,
|
||||
Err(error) => {
|
||||
eprintln!("Warning! Unable to parse url on image tag: {image_tag}\n{error}");
|
||||
continue;
|
||||
},
|
||||
};
|
||||
|
||||
let temp_map: HashMap<Url, Vec<String>> = HashMap::from([(url, vec![image_tag])]);
|
||||
|
||||
chapter_image_urls = chapter_image_urls.join(temp_map);
|
||||
}
|
||||
|
||||
return chapter_image_urls;
|
||||
}
|
|
@ -56,7 +56,7 @@ pub fn string_to_url(url: &str) -> Url {
|
|||
match Url::parse(url) {
|
||||
Ok(url) => url,
|
||||
Err(error) => {
|
||||
eprintln!("Error! Unable to parse: {url} into a valid url.");
|
||||
eprintln!("Error! Unable to parse: {url} into a valid url.\n{error}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,12 +2,16 @@ use std::{fs::OpenOptions, io::Write, path::PathBuf, process::exit};
|
|||
|
||||
use chrono::prelude::Local;
|
||||
use clap::Args;
|
||||
use epub_builder::{EpubBuilder, EpubContent, ReferenceType, ZipLibrary};
|
||||
use file_system_crap::convert_path_to_os_specific;
|
||||
use url::Url;
|
||||
|
||||
|
||||
mod book;
|
||||
mod constants;
|
||||
mod file_system_crap;
|
||||
mod html;
|
||||
mod http;
|
||||
mod misc;
|
||||
|
||||
/// struct that corresponds to arguments for Audiobook generation.
|
||||
#[derive(Args, Debug)]
|
||||
|
@ -61,7 +65,80 @@ pub fn generate_audiobook(audiobook_args: AudiobookArgs, book_url: Url, output_d
|
|||
/// This function DOES NOT do any error checking on the Url or output directory & WILL panic if they are wrong.
|
||||
/// Make sure the Url is valid and the output directory is writable BEFORE passing them to this.
|
||||
pub fn generate_epub(epub_args: EpubArgs, book_url: Url, output_directory: PathBuf) {
|
||||
// Until xhtml is working on MacOS this notice & exit code will remain.
|
||||
// See file_system_crap::setup_html2xhtml() for current status on MacOS support for this mode.
|
||||
#[cfg(target_os = "macos")] {
|
||||
eprint!("Error! This mode does not currently support MacOS. Try either html mode or markdown mode.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
let book = book::Book::new(book_url);
|
||||
|
||||
// Initialize the epub builder.
|
||||
let mut epub_builder = EpubBuilder::new(ZipLibrary::new().unwrap()).unwrap();
|
||||
|
||||
// Add author and title metadata.
|
||||
epub_builder.stylesheet(constants::EPUB_CSS.as_bytes()).unwrap(); // Use the epub_css in the constants.rs file.
|
||||
epub_builder
|
||||
.metadata("author", &book.author)
|
||||
.expect("Unable to add author metadata");
|
||||
epub_builder
|
||||
.metadata("title", &book.title)
|
||||
.expect("Unable to add title metadata");
|
||||
|
||||
// Download the cover image & add it to the epub.
|
||||
let cover_image = http::get_response(book.cover_image_url).get_bytes().to_vec();
|
||||
epub_builder.add_cover_image("cover.jpeg", cover_image.as_slice(), "image/jpeg").expect("Unable to add cover image.");
|
||||
|
||||
// Generate the cover xhtml.
|
||||
let cover_xhtml = format!(
|
||||
r#"<head></head><body><div style="text-align: center;">
|
||||
<h1><a href="{0}">{1}</a></h1>
|
||||
<img src="cover.jpeg"/>
|
||||
<h2>by: {2}</h2>
|
||||
<h3>Archived on: {3}</h3></div></body>"#,
|
||||
book.book_url,
|
||||
book.title,
|
||||
book.author,
|
||||
chrono::Local::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, false)
|
||||
);
|
||||
let cover_xhtml = format!("{0}{cover_xhtml}{1}", constants::EPUB_XML_HEAD, constants::EPUB_XML_TAIL);
|
||||
|
||||
// Add the cover xhtml to the epub.
|
||||
epub_builder.add_content(
|
||||
EpubContent::new("title.xhtml", cover_xhtml.as_bytes())
|
||||
.title("Cover")
|
||||
.reftype(ReferenceType::Cover),
|
||||
).expect("Unable to add cover");
|
||||
|
||||
// Add a table of contents after the cover page.
|
||||
epub_builder.inline_toc();
|
||||
|
||||
// Setup html2xhtml on the operating system.
|
||||
file_system_crap::setup_html2xhtml();
|
||||
|
||||
// TODO! Generate the epub body, deal with images etc etc. You know pickup from last night etc etc.
|
||||
// Finish setup_html2xhtml() first though dummy.
|
||||
|
||||
// Generate the finished epub data as a byte vector.
|
||||
let mut finished_epub: Vec<u8> = vec![];
|
||||
epub_builder.generate(&mut finished_epub).expect("Unable to generate epub data");
|
||||
|
||||
// Create the epub file and write the finished epub data to it.
|
||||
let output_path = convert_path_to_os_specific(output_directory.join(format!("{0}.epub", book.file_name_title)));
|
||||
let mut output_file = match OpenOptions::new().write(true).create_new(true).open(&output_path) {
|
||||
Ok(output_file) => output_file,
|
||||
Err(error) => {
|
||||
eprintln!("Error! Unable to create: {0}\n{error}", output_path.to_string_lossy());
|
||||
exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
output_file.write_all(finished_epub.as_slice())
|
||||
.expect(format!("Unable to write finished epub data to {0}", output_path.to_string_lossy()).as_str());
|
||||
|
||||
// Delete html2xhtml from the temp directory. It's good to clean up after yourself.
|
||||
file_system_crap::delete_html2xhtml();
|
||||
}
|
||||
|
||||
/// Generate an html archive from the given arguments, url, & outputs it to the output directory.
|
||||
|
@ -79,7 +156,7 @@ pub fn generate_html(html_args: HtmlArgs, book_url: Url, output_directory: PathB
|
|||
pub fn generate_markdown(markdown_args: MarkdownArgs, book_url: Url, output_directory: PathBuf) {
|
||||
let book = book::Book::new(book_url);
|
||||
|
||||
let output_path = convert_path_to_windows(output_directory.join(format!("{0}.md", book.title)));
|
||||
let output_path = convert_path_to_os_specific(output_directory.join(format!("{0}.md", book.file_name_title)));
|
||||
|
||||
// Create the md file. This will crash if it already exists or can not be created.
|
||||
let mut output_file = match OpenOptions::new().write(true).create_new(true).open(&output_path) {
|
||||
|
@ -110,7 +187,7 @@ pub fn generate_markdown(markdown_args: MarkdownArgs, book_url: Url, output_dire
|
|||
|
||||
if markdown_args.no_image_tags {
|
||||
// Remove image tags or not depending on args.
|
||||
buf = format!("\n\n{}\n\n", html2md::parse_html(&html::remove_image_tags(chapter.isolated_chapter_html)));
|
||||
buf = format!("\n\n{}\n\n", html2md::parse_html(&html::remove_image_tags(&chapter.isolated_chapter_html)));
|
||||
|
||||
} else {
|
||||
buf = format!("\n\n{}\n\n", html2md::parse_html(&chapter.isolated_chapter_html.html()));
|
||||
|
@ -119,18 +196,3 @@ pub fn generate_markdown(markdown_args: MarkdownArgs, book_url: Url, output_dire
|
|||
output_file.write_all(buf.as_bytes()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a given path to windows style if needed.
|
||||
fn convert_path_to_windows(path: PathBuf) -> PathBuf {
|
||||
// If target os is windows.
|
||||
#[cfg(target_os = "windows")] {
|
||||
use path_slash::PathBufExt as _;
|
||||
|
||||
return PathBuf::from_slash(path.into_os_string().into_string().unwrap());
|
||||
}
|
||||
|
||||
// If target os is not windows.
|
||||
#[cfg(not(target_os = "windows"))] {
|
||||
return path;
|
||||
}
|
||||
}
|
34
src/misc.rs
Normal file
34
src/misc.rs
Normal file
|
@ -0,0 +1,34 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
/// An extension to ``std::collections::HashMap<K, Vec<String>>``
pub trait HashMapExt<K> {
    /// Merges two ``Hashmap<K, Vec<String>>`` returning the merged hashmap.
    ///
    /// * A key found only in `new_hashmap` is inserted together with its vector.
    /// * A key found in both keeps the strings it already had, in order, followed by
    ///   the strings from `new_hashmap` that it did not already contain.
    fn join(self, new_hashmap: HashMap<K, Vec<String>>) -> HashMap<K, Vec<String>>;
}

impl<K: std::cmp::Eq + std::hash::Hash + std::clone::Clone> HashMapExt<K> for HashMap<K, Vec<String>> {
    fn join(mut self, other_hashmap: HashMap<K, Vec<String>>) -> HashMap<K, Vec<String>> {
        // `other_hashmap` is owned, so its keys and vectors are moved rather than cloned.
        for (key, other_strings) in other_hashmap {
            match self.get_mut(&key) {
                Some(self_strings) => {
                    for string in other_strings {
                        // Avoid repeating strings in the vectors.
                        if !self_strings.contains(&string) {
                            self_strings.push(string);
                        }
                    }
                }
                None => {
                    self.insert(key, other_strings);
                }
            }
        }

        self
    }
}
|
Loading…
Reference in a new issue