mirror of
https://github.com/Raine-gay/royal_road_archiver.git
synced 2024-11-26 17:08:47 -06:00
Bundled both windows and linux builds of html2xhtml and just generally made a bunch of work
This commit is contained in:
parent
778b1adf6a
commit
80f3d5b423
64
.vscode/launch.json
vendored
Normal file
64
.vscode/launch.json
vendored
Normal file
|
@ -0,0 +1,64 @@
|
|||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "Debug unit tests in library 'royal_road_archiver_lib'",
|
||||
"cargo": {
|
||||
"args": [
|
||||
"test",
|
||||
"--no-run",
|
||||
"--lib",
|
||||
"--package=royal_road_archiver"
|
||||
],
|
||||
"filter": {
|
||||
"name": "royal_road_archiver_lib",
|
||||
"kind": "lib"
|
||||
}
|
||||
},
|
||||
"args": [],
|
||||
"cwd": "${workspaceFolder}"
|
||||
},
|
||||
{
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "Debug executable 'royal_road_archiver_bin'",
|
||||
"cargo": {
|
||||
"args": [
|
||||
"build",
|
||||
"--bin=royal_road_archiver_bin",
|
||||
"--package=royal_road_archiver"
|
||||
],
|
||||
"filter": {
|
||||
"name": "royal_road_archiver_bin",
|
||||
"kind": "bin"
|
||||
}
|
||||
},
|
||||
"args": ["https://www.royalroad.com/fiction/22848/post-human", "markdown"],
|
||||
"cwd": "${workspaceFolder}"
|
||||
},
|
||||
{
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"name": "Debug unit tests in executable 'royal_road_archiver_bin'",
|
||||
"cargo": {
|
||||
"args": [
|
||||
"test",
|
||||
"--no-run",
|
||||
"--bin=royal_road_archiver_bin",
|
||||
"--package=royal_road_archiver"
|
||||
],
|
||||
"filter": {
|
||||
"name": "royal_road_archiver_bin",
|
||||
"kind": "bin"
|
||||
}
|
||||
},
|
||||
"args": [],
|
||||
"cwd": "${workspaceFolder}"
|
||||
}
|
||||
]
|
||||
}
|
5
.vscode/settings.json
vendored
5
.vscode/settings.json
vendored
|
@ -7,6 +7,9 @@
|
|||
],
|
||||
"rust-analyzer.showUnlinkedFileNotification": false,
|
||||
"cSpell.ignoreWords": [
|
||||
"royalroad"
|
||||
"autotools",
|
||||
"reqwest",
|
||||
"royalroad",
|
||||
"ureq"
|
||||
]
|
||||
}
|
1527
Cargo.lock
generated
1527
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -14,5 +14,10 @@ name = "royal_road_archiver_bin"
|
|||
path = "src/binary.rs"
|
||||
|
||||
[dependencies]
|
||||
bytes = "1.5.0"
|
||||
clap = { version = "4.4.18", features = ["derive"] }
|
||||
url = "2.5.0"
|
||||
regex = "1.10.3"
|
||||
reqwest = { version = "0.11.23", features = ["rustls", "blocking"] }
|
||||
scraper = "0.18.1"
|
||||
serde_json = "1.0.111"
|
||||
url = "2.5.0"
|
BIN
html2xhtml-linux/dtdquery
Executable file
BIN
html2xhtml-linux/dtdquery
Executable file
Binary file not shown.
BIN
html2xhtml-linux/html2xhtml
Executable file
BIN
html2xhtml-linux/html2xhtml
Executable file
Binary file not shown.
BIN
html2xhtml-windows/.libs/dtdquery.exe
Normal file
BIN
html2xhtml-windows/.libs/dtdquery.exe
Normal file
Binary file not shown.
201
html2xhtml-windows/.libs/dtdquery_ltshwrapper
Normal file
201
html2xhtml-windows/.libs/dtdquery_ltshwrapper
Normal file
|
@ -0,0 +1,201 @@
|
|||
#! /bin/sh
|
||||
|
||||
# dtdquery - temporary wrapper script for .libs/dtdquery.exe
|
||||
# Generated by libtool (GNU libtool) 2.4.2 Debian-2.4.2-1.11
|
||||
#
|
||||
# The dtdquery program cannot be directly executed until all the libtool
|
||||
# libraries that it depends on are installed.
|
||||
#
|
||||
# This wrapper script should never be moved out of the build directory.
|
||||
# If it is, it will not operate correctly.
|
||||
|
||||
# Sed substitution that helps us do robust quoting. It backslashifies
|
||||
# metacharacters that are still active within double-quoted strings.
|
||||
sed_quote_subst='s/\([`"$\\]\)/\\\1/g'
|
||||
|
||||
# Be Bourne compatible
|
||||
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
|
||||
emulate sh
|
||||
NULLCMD=:
|
||||
# Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
|
||||
# is contrary to our usage. Disable this feature.
|
||||
alias -g '${1+"$@"}'='"$@"'
|
||||
setopt NO_GLOB_SUBST
|
||||
else
|
||||
case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac
|
||||
fi
|
||||
BIN_SH=xpg4; export BIN_SH # for Tru64
|
||||
DUALCASE=1; export DUALCASE # for MKS sh
|
||||
|
||||
# The HP-UX ksh and POSIX shell print the target directory to stdout
|
||||
# if CDPATH is set.
|
||||
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
|
||||
|
||||
relink_command=""
|
||||
|
||||
# This environment variable determines our operation mode.
|
||||
if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then
|
||||
# install mode needs the following variables:
|
||||
generated_by_libtool_version='2.4.2'
|
||||
notinst_deplibs=''
|
||||
else
|
||||
# When we are sourced in execute mode, $file and $ECHO are already set.
|
||||
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
|
||||
file="$0"
|
||||
|
||||
# A function that is used when there is no print builtin or printf.
|
||||
func_fallback_echo ()
|
||||
{
|
||||
eval 'cat <<_LTECHO_EOF
|
||||
$1
|
||||
_LTECHO_EOF'
|
||||
}
|
||||
ECHO="printf %s\\n"
|
||||
fi
|
||||
|
||||
# Very basic option parsing. These options are (a) specific to
|
||||
# the libtool wrapper, (b) are identical between the wrapper
|
||||
# /script/ and the wrapper /executable/ which is used only on
|
||||
# windows platforms, and (c) all begin with the string --lt-
|
||||
# (application programs are unlikely to have options which match
|
||||
# this pattern).
|
||||
#
|
||||
# There are only two supported options: --lt-debug and
|
||||
# --lt-dump-script. There is, deliberately, no --lt-help.
|
||||
#
|
||||
# The first argument to this parsing function should be the
|
||||
# script's ../libtool value, followed by yes.
|
||||
lt_option_debug=
|
||||
func_parse_lt_options ()
|
||||
{
|
||||
lt_script_arg0=$0
|
||||
shift
|
||||
for lt_opt
|
||||
do
|
||||
case "$lt_opt" in
|
||||
--lt-debug) lt_option_debug=1 ;;
|
||||
--lt-dump-script)
|
||||
lt_dump_D=`$ECHO "X$lt_script_arg0" | /bin/sed -e 's/^X//' -e 's%/[^/]*$%%'`
|
||||
test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=.
|
||||
lt_dump_F=`$ECHO "X$lt_script_arg0" | /bin/sed -e 's/^X//' -e 's%^.*/%%'`
|
||||
cat "$lt_dump_D/$lt_dump_F"
|
||||
exit 0
|
||||
;;
|
||||
--lt-*)
|
||||
$ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Print the debug banner immediately:
|
||||
if test -n "$lt_option_debug"; then
|
||||
echo "dtdquery.exe:dtdquery:${LINENO}: libtool wrapper (GNU libtool) 2.4.2 Debian-2.4.2-1.11" 1>&2
|
||||
fi
|
||||
}
|
||||
|
||||
# Used when --lt-debug. Prints its arguments to stdout
|
||||
# (redirection is the responsibility of the caller)
|
||||
func_lt_dump_args ()
|
||||
{
|
||||
lt_dump_args_N=1;
|
||||
for lt_arg
|
||||
do
|
||||
$ECHO "dtdquery.exe:dtdquery:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg"
|
||||
lt_dump_args_N=`expr $lt_dump_args_N + 1`
|
||||
done
|
||||
}
|
||||
|
||||
# Core function for launching the target application
|
||||
func_exec_program_core ()
|
||||
{
|
||||
|
||||
if test -n "$lt_option_debug"; then
|
||||
$ECHO "dtdquery.exe:dtdquery:${LINENO}: newargv[0]: $progdir/$program" 1>&2
|
||||
func_lt_dump_args ${1+"$@"} 1>&2
|
||||
fi
|
||||
exec "$progdir/$program" ${1+"$@"}
|
||||
|
||||
$ECHO "$0: cannot exec $program $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
# A function to encapsulate launching the target application
|
||||
# Strips options in the --lt-* namespace from $@ and
|
||||
# launches target application with the remaining arguments.
|
||||
func_exec_program ()
|
||||
{
|
||||
case " $* " in
|
||||
*\ --lt-*)
|
||||
for lt_wr_arg
|
||||
do
|
||||
case $lt_wr_arg in
|
||||
--lt-*) ;;
|
||||
*) set x "$@" "$lt_wr_arg"; shift;;
|
||||
esac
|
||||
shift
|
||||
done ;;
|
||||
esac
|
||||
func_exec_program_core ${1+"$@"}
|
||||
}
|
||||
|
||||
# Parse options
|
||||
func_parse_lt_options "$0" ${1+"$@"}
|
||||
|
||||
# Find the directory that this script lives in.
|
||||
thisdir=`$ECHO "$file" | /bin/sed 's%/[^/]*$%%'`
|
||||
test "x$thisdir" = "x$file" && thisdir=.
|
||||
|
||||
# Follow symbolic links until we get to the real thisdir.
|
||||
file=`ls -ld "$file" | /bin/sed -n 's/.*-> //p'`
|
||||
while test -n "$file"; do
|
||||
destdir=`$ECHO "$file" | /bin/sed 's%/[^/]*$%%'`
|
||||
|
||||
# If there was a directory component, then change thisdir.
|
||||
if test "x$destdir" != "x$file"; then
|
||||
case "$destdir" in
|
||||
[\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;;
|
||||
*) thisdir="$thisdir/$destdir" ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
file=`$ECHO "$file" | /bin/sed 's%^.*/%%'`
|
||||
file=`ls -ld "$thisdir/$file" | /bin/sed -n 's/.*-> //p'`
|
||||
done
|
||||
|
||||
# Usually 'no', except on cygwin/mingw when embedded into
|
||||
# the cwrapper.
|
||||
WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=yes
|
||||
if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then
|
||||
# special case for '.'
|
||||
if test "$thisdir" = "."; then
|
||||
thisdir=`pwd`
|
||||
fi
|
||||
# remove .libs from thisdir
|
||||
case "$thisdir" in
|
||||
*[\\/].libs ) thisdir=`$ECHO "$thisdir" | /bin/sed 's%[\\/][^\\/]*$%%'` ;;
|
||||
.libs ) thisdir=. ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Try to get the absolute directory name.
|
||||
absdir=`cd "$thisdir" && pwd`
|
||||
test -n "$absdir" && thisdir="$absdir"
|
||||
|
||||
program='dtdquery.exe'
|
||||
progdir="$thisdir/.libs"
|
||||
|
||||
|
||||
if test -f "$progdir/$program"; then
|
||||
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
|
||||
# Run the actual program with our arguments.
|
||||
func_exec_program ${1+"$@"}
|
||||
fi
|
||||
else
|
||||
# The program doesn't exist.
|
||||
$ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2
|
||||
$ECHO "This script is just a wrapper for $program." 1>&2
|
||||
$ECHO "See the libtool documentation for more information." 1>&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
BIN
html2xhtml-windows/.libs/html2xhtml.exe
Normal file
BIN
html2xhtml-windows/.libs/html2xhtml.exe
Normal file
Binary file not shown.
201
html2xhtml-windows/.libs/html2xhtml_ltshwrapper
Normal file
201
html2xhtml-windows/.libs/html2xhtml_ltshwrapper
Normal file
|
@ -0,0 +1,201 @@
|
|||
#! /bin/sh
|
||||
|
||||
# html2xhtml - temporary wrapper script for .libs/html2xhtml.exe
|
||||
# Generated by libtool (GNU libtool) 2.4.2 Debian-2.4.2-1.11
|
||||
#
|
||||
# The html2xhtml program cannot be directly executed until all the libtool
|
||||
# libraries that it depends on are installed.
|
||||
#
|
||||
# This wrapper script should never be moved out of the build directory.
|
||||
# If it is, it will not operate correctly.
|
||||
|
||||
# Sed substitution that helps us do robust quoting. It backslashifies
|
||||
# metacharacters that are still active within double-quoted strings.
|
||||
sed_quote_subst='s/\([`"$\\]\)/\\\1/g'
|
||||
|
||||
# Be Bourne compatible
|
||||
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
|
||||
emulate sh
|
||||
NULLCMD=:
|
||||
# Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
|
||||
# is contrary to our usage. Disable this feature.
|
||||
alias -g '${1+"$@"}'='"$@"'
|
||||
setopt NO_GLOB_SUBST
|
||||
else
|
||||
case `(set -o) 2>/dev/null` in *posix*) set -o posix;; esac
|
||||
fi
|
||||
BIN_SH=xpg4; export BIN_SH # for Tru64
|
||||
DUALCASE=1; export DUALCASE # for MKS sh
|
||||
|
||||
# The HP-UX ksh and POSIX shell print the target directory to stdout
|
||||
# if CDPATH is set.
|
||||
(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
|
||||
|
||||
relink_command=""
|
||||
|
||||
# This environment variable determines our operation mode.
|
||||
if test "$libtool_install_magic" = "%%%MAGIC variable%%%"; then
|
||||
# install mode needs the following variables:
|
||||
generated_by_libtool_version='2.4.2'
|
||||
notinst_deplibs=''
|
||||
else
|
||||
# When we are sourced in execute mode, $file and $ECHO are already set.
|
||||
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
|
||||
file="$0"
|
||||
|
||||
# A function that is used when there is no print builtin or printf.
|
||||
func_fallback_echo ()
|
||||
{
|
||||
eval 'cat <<_LTECHO_EOF
|
||||
$1
|
||||
_LTECHO_EOF'
|
||||
}
|
||||
ECHO="printf %s\\n"
|
||||
fi
|
||||
|
||||
# Very basic option parsing. These options are (a) specific to
|
||||
# the libtool wrapper, (b) are identical between the wrapper
|
||||
# /script/ and the wrapper /executable/ which is used only on
|
||||
# windows platforms, and (c) all begin with the string --lt-
|
||||
# (application programs are unlikely to have options which match
|
||||
# this pattern).
|
||||
#
|
||||
# There are only two supported options: --lt-debug and
|
||||
# --lt-dump-script. There is, deliberately, no --lt-help.
|
||||
#
|
||||
# The first argument to this parsing function should be the
|
||||
# script's ../libtool value, followed by yes.
|
||||
lt_option_debug=
|
||||
func_parse_lt_options ()
|
||||
{
|
||||
lt_script_arg0=$0
|
||||
shift
|
||||
for lt_opt
|
||||
do
|
||||
case "$lt_opt" in
|
||||
--lt-debug) lt_option_debug=1 ;;
|
||||
--lt-dump-script)
|
||||
lt_dump_D=`$ECHO "X$lt_script_arg0" | /bin/sed -e 's/^X//' -e 's%/[^/]*$%%'`
|
||||
test "X$lt_dump_D" = "X$lt_script_arg0" && lt_dump_D=.
|
||||
lt_dump_F=`$ECHO "X$lt_script_arg0" | /bin/sed -e 's/^X//' -e 's%^.*/%%'`
|
||||
cat "$lt_dump_D/$lt_dump_F"
|
||||
exit 0
|
||||
;;
|
||||
--lt-*)
|
||||
$ECHO "Unrecognized --lt- option: '$lt_opt'" 1>&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Print the debug banner immediately:
|
||||
if test -n "$lt_option_debug"; then
|
||||
echo "html2xhtml.exe:html2xhtml:${LINENO}: libtool wrapper (GNU libtool) 2.4.2 Debian-2.4.2-1.11" 1>&2
|
||||
fi
|
||||
}
|
||||
|
||||
# Used when --lt-debug. Prints its arguments to stdout
|
||||
# (redirection is the responsibility of the caller)
|
||||
func_lt_dump_args ()
|
||||
{
|
||||
lt_dump_args_N=1;
|
||||
for lt_arg
|
||||
do
|
||||
$ECHO "html2xhtml.exe:html2xhtml:${LINENO}: newargv[$lt_dump_args_N]: $lt_arg"
|
||||
lt_dump_args_N=`expr $lt_dump_args_N + 1`
|
||||
done
|
||||
}
|
||||
|
||||
# Core function for launching the target application
|
||||
func_exec_program_core ()
|
||||
{
|
||||
|
||||
if test -n "$lt_option_debug"; then
|
||||
$ECHO "html2xhtml.exe:html2xhtml:${LINENO}: newargv[0]: $progdir/$program" 1>&2
|
||||
func_lt_dump_args ${1+"$@"} 1>&2
|
||||
fi
|
||||
exec "$progdir/$program" ${1+"$@"}
|
||||
|
||||
$ECHO "$0: cannot exec $program $*" 1>&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
# A function to encapsulate launching the target application
|
||||
# Strips options in the --lt-* namespace from $@ and
|
||||
# launches target application with the remaining arguments.
|
||||
func_exec_program ()
|
||||
{
|
||||
case " $* " in
|
||||
*\ --lt-*)
|
||||
for lt_wr_arg
|
||||
do
|
||||
case $lt_wr_arg in
|
||||
--lt-*) ;;
|
||||
*) set x "$@" "$lt_wr_arg"; shift;;
|
||||
esac
|
||||
shift
|
||||
done ;;
|
||||
esac
|
||||
func_exec_program_core ${1+"$@"}
|
||||
}
|
||||
|
||||
# Parse options
|
||||
func_parse_lt_options "$0" ${1+"$@"}
|
||||
|
||||
# Find the directory that this script lives in.
|
||||
thisdir=`$ECHO "$file" | /bin/sed 's%/[^/]*$%%'`
|
||||
test "x$thisdir" = "x$file" && thisdir=.
|
||||
|
||||
# Follow symbolic links until we get to the real thisdir.
|
||||
file=`ls -ld "$file" | /bin/sed -n 's/.*-> //p'`
|
||||
while test -n "$file"; do
|
||||
destdir=`$ECHO "$file" | /bin/sed 's%/[^/]*$%%'`
|
||||
|
||||
# If there was a directory component, then change thisdir.
|
||||
if test "x$destdir" != "x$file"; then
|
||||
case "$destdir" in
|
||||
[\\/]* | [A-Za-z]:[\\/]*) thisdir="$destdir" ;;
|
||||
*) thisdir="$thisdir/$destdir" ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
file=`$ECHO "$file" | /bin/sed 's%^.*/%%'`
|
||||
file=`ls -ld "$thisdir/$file" | /bin/sed -n 's/.*-> //p'`
|
||||
done
|
||||
|
||||
# Usually 'no', except on cygwin/mingw when embedded into
|
||||
# the cwrapper.
|
||||
WRAPPER_SCRIPT_BELONGS_IN_OBJDIR=yes
|
||||
if test "$WRAPPER_SCRIPT_BELONGS_IN_OBJDIR" = "yes"; then
|
||||
# special case for '.'
|
||||
if test "$thisdir" = "."; then
|
||||
thisdir=`pwd`
|
||||
fi
|
||||
# remove .libs from thisdir
|
||||
case "$thisdir" in
|
||||
*[\\/].libs ) thisdir=`$ECHO "$thisdir" | /bin/sed 's%[\\/][^\\/]*$%%'` ;;
|
||||
.libs ) thisdir=. ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Try to get the absolute directory name.
|
||||
absdir=`cd "$thisdir" && pwd`
|
||||
test -n "$absdir" && thisdir="$absdir"
|
||||
|
||||
program='html2xhtml.exe'
|
||||
progdir="$thisdir/.libs"
|
||||
|
||||
|
||||
if test -f "$progdir/$program"; then
|
||||
if test "$libtool_execute_magic" != "%%%MAGIC variable%%%"; then
|
||||
# Run the actual program with our arguments.
|
||||
func_exec_program ${1+"$@"}
|
||||
fi
|
||||
else
|
||||
# The program doesn't exist.
|
||||
$ECHO "$0: error: \`$progdir/$program' does not exist" 1>&2
|
||||
$ECHO "This script is just a wrapper for $program." 1>&2
|
||||
$ECHO "See the libtool documentation for more information." 1>&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
1037
html2xhtml-windows/.libs/lt-dtdquery.c
Normal file
1037
html2xhtml-windows/.libs/lt-dtdquery.c
Normal file
File diff suppressed because it is too large
Load diff
1038
html2xhtml-windows/.libs/lt-html2xhtml.c
Normal file
1038
html2xhtml-windows/.libs/lt-html2xhtml.c
Normal file
File diff suppressed because it is too large
Load diff
BIN
html2xhtml-windows/dtdquery.exe
Normal file
BIN
html2xhtml-windows/dtdquery.exe
Normal file
Binary file not shown.
BIN
html2xhtml-windows/html2xhtml.exe
Normal file
BIN
html2xhtml-windows/html2xhtml.exe
Normal file
Binary file not shown.
95
src/book.rs
Normal file
95
src/book.rs
Normal file
|
@ -0,0 +1,95 @@
|
|||
use scraper::Html;
|
||||
use url::Url;
|
||||
|
||||
use crate::{html, http};
|
||||
|
||||
/// A struct representing a book & all the needed data to generate one.
|
||||
pub struct Book {
|
||||
/// The RoyalRoad Url for the book.
|
||||
book_url: Url,
|
||||
|
||||
/// The book's title.
|
||||
title: String,
|
||||
|
||||
/// The book's author.
|
||||
author: String,
|
||||
|
||||
/// A Url to the book's cover image.
|
||||
cover_image_url: Url,
|
||||
|
||||
/// The raw html data of the RoyalRoad index page.
|
||||
index_html: Html,
|
||||
|
||||
/// A vector of the book's chapters.
|
||||
chapters: Vec<Chapter>,
|
||||
}
|
||||
|
||||
impl Book {
|
||||
/// Generate a new book instance with all the needed data from a given url.
|
||||
pub fn new(book_url: Url) -> Book {
|
||||
let index_html = html::string_to_html_document(&http::get_response(book_url.clone()).get_text());
|
||||
|
||||
let chapter_names_and_urls = html::get_chapter_names_and_urls_from_index(&index_html);
|
||||
|
||||
let mut chapters: Vec<Chapter> = Vec::with_capacity(chapter_names_and_urls.len());
|
||||
|
||||
for i in 0..chapter_names_and_urls.len() {
|
||||
let chapter = Chapter::new(&chapter_names_and_urls[i][0], &chapter_names_and_urls[i][1]);
|
||||
chapters.push(chapter);
|
||||
}
|
||||
|
||||
Book {
|
||||
book_url: book_url,
|
||||
title: html::get_title_from_index(&index_html),
|
||||
author: html::get_author_from_index(&index_html),
|
||||
cover_image_url: http::string_to_url(&html::get_cover_image_url_from_index(&index_html)),
|
||||
index_html: index_html,
|
||||
chapters: chapters,
|
||||
}
|
||||
}
|
||||
|
||||
/// Count how many paragraphs are in the book.
|
||||
pub fn count_paragraphs(&self) -> u128 {
|
||||
// TODO!
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
/// A struct representing a chapter.
|
||||
struct Chapter {
|
||||
/// The Url of the chapter.
|
||||
chapter_url: Url,
|
||||
|
||||
/// The name of the chapter.
|
||||
chapter_name: String,
|
||||
|
||||
/// The raw html data of the page.
|
||||
raw_chapter_html: Html,
|
||||
|
||||
/// The isolated chapter html.
|
||||
isolated_chapter_html: Html,
|
||||
}
|
||||
|
||||
impl Chapter {
|
||||
fn new(chapter_name: &str, chapter_url: &str) -> Self {
|
||||
let chapter_url = http::string_to_url(&chapter_url);
|
||||
let raw_chapter_html = html::string_to_html_document(&http::get_response(chapter_url.clone()).get_text());
|
||||
|
||||
Chapter {
|
||||
chapter_url: chapter_url,
|
||||
chapter_name: chapter_name.to_string(),
|
||||
raw_chapter_html: raw_chapter_html.clone(),
|
||||
isolated_chapter_html: html::isolate_chapter_content(raw_chapter_html)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO! Placeholder: no fields yet.
struct BookImages {}
|
||||
|
||||
// TODO! Placeholder: no fields yet.
struct BookCss {}
|
141
src/html.rs
Normal file
141
src/html.rs
Normal file
|
@ -0,0 +1,141 @@
|
|||
use std::process::exit;
|
||||
|
||||
use regex::Regex;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
/// Convert a string to an html document.
|
||||
pub fn string_to_html_document(document_string: &str) -> Html {
|
||||
Html::parse_document(document_string)
|
||||
}
|
||||
|
||||
/// Convert a string to an html fragment.
|
||||
pub fn string_to_html_fragment(fragment_string: &str) -> Html {
|
||||
Html::parse_fragment(fragment_string)
|
||||
}
|
||||
|
||||
/// Get the book's title from the index.
|
||||
pub fn get_title_from_index(index_html: &Html) -> String {
|
||||
let selector = Selector::parse("meta").unwrap(); // Build a selector that finds the 'meta' html tag
|
||||
for element in index_html.select(&selector) {
|
||||
// Loop through all meta tags in the html document.
|
||||
match element.value().attr("name") {
|
||||
// Check if the meta tag contains attribute: "name"
|
||||
None => continue,
|
||||
Some(x) => {
|
||||
if x == "twitter:title" {
|
||||
// If it does contain attribute "name", check if the content of that attribute is "twitter:title"
|
||||
return element.value().attr("content").unwrap().to_owned();
|
||||
// If it is, extract the data from the content attribute.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
eprintln!("Error! Unable to find book title. Royal road have probably changed their front-end code. Please report this to me on:\nhttps://github.com/Raine-gay/royal_road_archiver");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/// Get the book's author from index
|
||||
pub fn get_author_from_index(index_html: &Html) -> String {
|
||||
let selector = Selector::parse("meta").unwrap();
|
||||
for element in index_html.select(&selector) {
|
||||
match element.value().attr("property") {
|
||||
None => continue,
|
||||
Some(x) => {
|
||||
if x == "books:author" {
|
||||
return element.value().attr("content").unwrap().to_owned();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
eprintln!("Error! Unable to find book author. Royal road have probably changed their front-end code. Please report this to me on:\nhttps://github.com/Raine-gay/royal_road_archiver");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/// Get the book's cover image url from the index
|
||||
pub fn get_cover_image_url_from_index(index_html: &Html) -> String {
|
||||
let selector = Selector::parse("meta").unwrap();
|
||||
for element in index_html.select(&selector) {
|
||||
match element.value().attr("property") {
|
||||
None => continue,
|
||||
Some(x) => {
|
||||
if x == "og:image" {
|
||||
return element.value().attr("content").unwrap().to_owned();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
eprintln!("Error! Unable to find cover image url. Royal road have probably changed their front-end code. Please report this to me on:\nhttps://github.com/Raine-gay/royal_road_archiver");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/// Gets the chapter names and urls from the index.
|
||||
///
|
||||
/// This gets stored in a vector where index 0 is the chapter name, and index 1 is the url.
|
||||
pub fn get_chapter_names_and_urls_from_index(index_html: &Html) -> Vec<[String; 2]> {
|
||||
// I wont lie. I have almost 0 idea what a bunch of this shit does since it's highly specific to RoyalRoad.
|
||||
// I've commented in the gist of it, but we have no memory actually writing this function.
|
||||
|
||||
let mut chapters: Vec<[String; 2]> = Vec::new();
|
||||
let mut raw_json_data = String::new();
|
||||
|
||||
// Find a script tag that has "window.chapters" inside the inner html. This is all in json format.
|
||||
let selector = Selector::parse("script").unwrap();
|
||||
for element in index_html.select(&selector) {
|
||||
if element.inner_html().contains("window.chapters") {
|
||||
raw_json_data = element.inner_html();
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Exit it if unable to find the needed json data. That probably means royal road has changed their code.
|
||||
if raw_json_data.is_empty() {
|
||||
eprintln!("Error! Unable to find json chapter data. Royal road have probably changed their front-end code. Please report this to me on:\nhttps://github.com/Raine-gay/royal_road_archiver");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// I have absolutely no idea what this regex does; but it's probably important.
|
||||
const REGEX: &str = r#"window.chapters = (\[.*?]);"#;
|
||||
let regex = Regex::new(REGEX).unwrap();
|
||||
|
||||
// I still have no fucking clue what this magic part does; but it works so we ain't fucking touching it.
|
||||
let chapter_raw_json = regex
|
||||
.captures(&raw_json_data)
|
||||
.unwrap()
|
||||
.get(1)
|
||||
.map_or("[]", |m| m.as_str());
|
||||
|
||||
// and it just spits out json when done. Neat.
|
||||
let chapter_json: serde_json::Value = serde_json::from_str(chapter_raw_json).unwrap();
|
||||
|
||||
// For each chapter in the json, do some processing to remove the quotes then shove it onto the vector.
|
||||
for chapter in chapter_json.as_array().unwrap() {
|
||||
let chapter_name = chapter["title"].to_string().replace('"', "");
|
||||
let url = format!(
|
||||
"https://www.royalroad.com{}",
|
||||
chapter["url"].to_string().replace('"', "")
|
||||
);
|
||||
|
||||
chapters.push([chapter_name, url]);
|
||||
}
|
||||
|
||||
// Return that wanker.
|
||||
return chapters;
|
||||
}
|
||||
|
||||
/// Isolate chapter content from the rest of the shit on the page.
|
||||
pub fn isolate_chapter_content(raw_chapter_html: Html) -> Html {
|
||||
let page_html = Html::parse_document(&raw_chapter_html.html());
|
||||
|
||||
let selector = Selector::parse("div").unwrap();
|
||||
for element in page_html.select(&selector) {
|
||||
match element.value().attr("class") {
|
||||
None => continue,
|
||||
Some(x) => {
|
||||
if x == "chapter-inner chapter-content" {
|
||||
return string_to_html_fragment(&element.inner_html());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
eprintln!("Error! Unable to isolate chapter content");
|
||||
exit(1);
|
||||
}
|
63
src/http.rs
Normal file
63
src/http.rs
Normal file
|
@ -0,0 +1,63 @@
|
|||
use std::process::exit;
|
||||
|
||||
use reqwest::{blocking::Response, header::HeaderMap};
|
||||
use url::Url;
|
||||
|
||||
// A struct representing an HttpResponse and the Url it originated from.
|
||||
pub struct HttpResponse {
|
||||
url: Url,
|
||||
pub response: Response,
|
||||
}
|
||||
|
||||
impl HttpResponse {
|
||||
/// Get the response headers.
|
||||
pub fn get_headers(&self) -> &HeaderMap {
|
||||
self.response.headers()
|
||||
}
|
||||
|
||||
/// Attempt to convert the response to text. Exits the program if it fails.
|
||||
pub fn get_text(self) -> String {
|
||||
match self.response.text() {
|
||||
Ok(response_text) => response_text,
|
||||
Err(error) => {
|
||||
eprintln!("Error! Unable to convert response from {0} into text\n{error}", self.url);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to convert the response to bytes. Used for images. Exits the program if it fails.
|
||||
pub fn get_bytes(self) -> bytes::Bytes{
|
||||
match self.response.bytes() {
|
||||
Ok(response_bytes) => response_bytes,
|
||||
Err(error) => {
|
||||
eprintln!("Error! Unable to convert response from {0} into bytes\n{error}", self.url);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get an http response for a given url. Exits the program if it fails.
|
||||
pub fn get_response(url: Url) -> HttpResponse {
|
||||
let response_result = reqwest::blocking::get(url.clone());
|
||||
|
||||
match response_result {
|
||||
Ok(response) => HttpResponse { url, response },
|
||||
Err(error) => {
|
||||
eprintln!("Error! Unable to get a response from: {url}\n{error}");
|
||||
exit(1);
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// A function to convert a string to a url. Exits the program if it fails.
|
||||
pub fn string_to_url(url: &str) -> Url {
|
||||
match Url::parse(url) {
|
||||
Ok(url) => url,
|
||||
Err(error) => {
|
||||
eprintln!("Error! Unable to parse: {url} into a valid url.");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -3,6 +3,10 @@ use std::path::PathBuf;
|
|||
use clap::Args;
|
||||
use url::Url;
|
||||
|
||||
mod book;
|
||||
mod html;
|
||||
mod http;
|
||||
|
||||
/// struct that corresponds to arguments for Audiobook generation.
|
||||
#[derive(Args, Debug)]
|
||||
pub struct AudiobookArgs {
|
||||
|
@ -71,5 +75,5 @@ pub fn generate_html(html_args: HtmlArgs, book_url: Url, output_directory: PathB
|
|||
/// This function DOES NOT do any error checking on the Url or output directory & WILL panic if they are wrong.
|
||||
/// Make sure the Url is valid and the output directory is writable BEFORE passing them to this.
|
||||
pub fn generate_markdown(markdown_args: MarkdownArgs, book_url: Url, output_directory: PathBuf) {
|
||||
eprintln!("This is not implemented yet.");
|
||||
let book = book::Book::new(book_url);
|
||||
}
|
Loading…
Reference in a new issue