website2pdf/main.py at shotcut-user-guide · mltframework/website2pdf · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import hashlib
import os
import re
import asyncio
import argparse
import time
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import (
    DictionaryObject,
    NumberObject,
    NameObject,
    ArrayObject,
    RectangleObject,
    AnnotationBuilder,
)
from reportlab.pdfgen import canvas

OUTPUT_DIR = "website_pdfs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Argument Parser to add options
parser = argparse.ArgumentParser(description="Crawl a website and save as PDFs.")
parser.add_argument(
    "root_url", type=str, help="The root URL of the website to start crawling from."
)
parser.add_argument(
    "-e",
    "--exclude",
    nargs="*",
    help="Link texts to exclude from crawling.",
    default=[],
)
parser.add_argument(
    "-L",
    "--level",
    type=int,
    help="Max depth of the crawl (0 = root page only)",
    default=0,
)
args = parser.parse_args()


async def crawl_and_save_pdf(
    url,
    visited,
    visited_hashes,
    browser,
    base_url,
    depth,
    max_depth,
    exclude_texts,
    pdf_info,
):
    normalized_url = normalize_url(url)
    if normalized_url in visited or depth > max_depth:
        return
    visited.add(normalized_url)

    # Create browser context and page
    context = await browser.new_context()
    page = await context.new_page()

    try:
        # Navigate to the page
        await page.goto(url, wait_until="load")
        await page.evaluate("document.body.style.zoom=0.8")

        # Get the page content
        content = await page.content()
        content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()

        if content_hash in visited_hashes:
            print(f"Duplicate content found at {url}, skipping.")
            return
        visited_hashes.add(content_hash)

        # Save the page as a PDF
        sanitized_url = re.sub(r"[^a-zA-Z0-9]", "_", normalized_url)
        page_path = os.path.join(OUTPUT_DIR, f"{sanitized_url}.pdf")
        await page.pdf(path=page_path)
        print(f"Saved: {url} to {page_path}")

        soup = BeautifulSoup(content, "html.parser")

        # Try to get the first <h1> tag text for a better title
        h1_tag = soup.find("h1")
        if h1_tag and h1_tag.get_text(strip=True):
            title = h1_tag.get_text(strip=True)
        else:
            # Fallback to page title
            title = await page.title()

        # Collect information for TOC
        pdf_info.append(
            {
                "title": title,
                "url": normalized_url,
                "file_path": page_path,
                "num_pages": None,  # Will fill later
                "start_page": None,  # Will fill later
            }
        )

        # Extract links for further traversal
        for link_tag in soup.find_all("a", href=True):
            link_text = link_tag.get_text(strip=True)
            href = link_tag["href"]
            next_url = urljoin(
                base_url, href
            )  # Ensure correct handling of relative URLs

            # Normalize the next URL
            normalized_next_url = normalize_url(next_url)

            # Skip links with text matching any of the excluded texts
            if any(
                len(link_text) == 0 or
                exclude_text.lower() in link_text.lower()
                for exclude_text in exclude_texts
            ):
                # print(f"Skipping link: {link_text} ({next_url})")
                continue

            # Check if the next URL is valid and belongs to the base domain
            parsed_next_url = urlparse(normalized_next_url)
            parsed_base_url = urlparse(base_url)
            if (
                parsed_next_url.netloc == parsed_base_url.netloc
                and normalized_next_url not in visited
            ):
                time.sleep(1)  # Sleep for a second to avoid being blocked
                await crawl_and_save_pdf(
                    next_url,
                    visited,
                    visited_hashes,
                    browser,
                    base_url,
                    depth + 1,
                    max_depth,
                    exclude_texts,
                    pdf_info,
                )

    except Exception as e:
        print(f"Error visiting {url}: {e}")

    finally:
        await page.close()
        await context.close()


def create_table_of_contents(toc_filename, pdf_info):
    PPI = 72  # Points per inch
    page_size = (8.5 * PPI, 11 * PPI)
    c = canvas.Canvas(toc_filename, page_size)
    title = "Shotcut User Guide"
    c.setTitle(title)

    c.setFont("Helvetica", 16)
    c.drawCentredString(0.5 * page_size[0], 0.95 * page_size[1], title)
    c.drawCentredString(0.5 * page_size[0], 0.92 * page_size[1], "Table of Contents")
    c.setFont("Helvetica", 12)
    import datetime
    current_date = datetime.datetime.now().strftime("(%B, %Y)")
    c.drawRightString(0.95 * page_size[0], 0.92 * page_size[1], current_date)

    y_position = 0.87 * page_size[1]
    link_rects = [[]]  # To store link positions for annotations
    page_number = 0

    for i, entry in enumerate(pdf_info):
        # Shorten title if it's too long
        title = (
            entry["title"] if len(entry["title"]) <= 60 else entry["title"][:57] + "..."
        )
        # link_text = f"{i + 1}. {title}"
        link_text = title

        # Add entry to TOC
        c.drawString(50, y_position, link_text)

        # Record the position for the link annotation
        text_width = c.stringWidth(link_text, "Helvetica", 12)
        x1 = 50
        y1 = y_position - 2
        x2 = x1 + text_width
        y2 = y_position + 10
        link_rect = (x1, y1, x2, y2)
        link_rects[page_number].append(link_rect)

        y_position -= 20
        # Move to next page if space runs out
        if y_position < 50:
            c.showPage()
            y_position = 0.87 * page_size[1]
            page_number += 1
            link_rects.append([])  # Add a new list for the next page

    c.save()
    print(f"Table of Contents saved as {toc_filename}")

    return link_rects  # Return the positions for later use


def combine_pdfs(output_filename, toc_filename, pdf_info):
    writer = PdfWriter()
    total_pages = 0

    # Read the TOC PDF and add its pages
    toc_reader = PdfReader(toc_filename)
    writer.append_pages_from_reader(toc_reader)
    total_pages = len(toc_reader.pages)

    # Keep track of starting page numbers
    for info in pdf_info:
        pdf_reader = PdfReader(info["file_path"])
        num_pages = len(pdf_reader.pages)
        info["num_pages"] = num_pages
        info["start_page"] = total_pages  # Page numbering starts from 0
        total_pages += num_pages
        print(f"Appending {num_pages} at {total_pages} from PDF {info['file_path']}")
        writer.append_pages_from_reader(pdf_reader)

    # Write the combined PDF without annotations first
    with open(output_filename, "wb") as f:
        writer.write(f)

    print(f"Combined PDF saved as {output_filename}")


def add_internal_links(output_filename, link_rects, pdf_info):
    reader = PdfReader(output_filename)
    writer = PdfWriter()

    # Copy pages to writer
    for page in reader.pages:
        writer.add_page(page)

    # Iterate over the links and add annotations
    links_per_page = len(link_rects[0])
    for page_number, link_rects in enumerate(link_rects):
        for link_number, rect in enumerate(link_rects):
            info = pdf_info[links_per_page * page_number + link_number]
            writer.add_annotation(page_number, AnnotationBuilder.link(rect, target_page_index=info["start_page"]))

    # Save the updated PDF with annotations
    with open(output_filename, "wb") as f:
        writer.write(f)

    print(f"Internal links added to {output_filename}")


async def main(root_url, exclude_texts, max_depth):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        visited = set()
        visited_hashes = set()
        pdf_info = []  # To store information about each crawled page
        await crawl_and_save_pdf(
            root_url,
            visited,
            visited_hashes,
            browser,
            root_url,
            0,
            max_depth,
            exclude_texts,
            pdf_info,
        )
        await browser.close()

    # Generate TOC and get link positions
    pdf_info = pdf_info[1:]  # Skip the first entry (TOC itself)
    toc_filename = "toc.pdf"
    link_rects = create_table_of_contents(toc_filename, pdf_info)

    # Combine all PDFs
    final_output = "Shotcut User Guide.pdf"
    combine_pdfs(final_output, toc_filename, pdf_info)

    # Add internal links to TOC
    add_internal_links(final_output, link_rects, pdf_info)


from urllib.parse import urljoin, urlparse, urlunparse, urlencode, parse_qsl


def normalize_url(url):
    """
    Normalize the URL to ensure consistent representation.
    """
    parsed = urlparse(url)
    # Remove fragment
    parsed = parsed._replace(fragment="")
    # Remove 'www.' prefix and convert netloc to lowercase
    netloc = parsed.netloc.lower()
    if netloc.startswith("www."):
        netloc = netloc[4:]
    # Remove default port numbers
    if netloc.endswith(":80"):
        netloc = netloc[:-3]
    elif netloc.endswith(":443"):
        netloc = netloc[:-4]
    # Remove trailing slash from path
    path = parsed.path.rstrip("/")
    # Sort query parameters (if you want to consider query parameters)
    # If you want to ignore query parameters, comment out the next two lines
    query = urlencode(sorted(parse_qsl(parsed.query)))
    parsed = parsed._replace(netloc=netloc, path=path, query=query)
    # Reconstruct the URL without the fragment
    normalized = urlunparse(parsed)
    return normalized.lower()


if __name__ == "__main__":
    asyncio.run(main(args.root_url, args.exclude, args.level))