From f23639984700abe950a49cdeab51b343ba416fc4 Mon Sep 17 00:00:00 2001 From: Jaiv Patel Date: Sat, 7 Mar 2026 11:30:22 +0530 Subject: [PATCH 1/3] Add Link Checker script with README and requirements --- LinkChecker/.gitignore | 5 ++ LinkChecker/linkchecker.py | 117 +++++++++++++++++++++++++++++++++++ LinkChecker/readme.md | 76 +++++++++++++++++++++++ LinkChecker/requirements.txt | 2 + 4 files changed, 200 insertions(+) create mode 100644 LinkChecker/.gitignore create mode 100644 LinkChecker/linkchecker.py create mode 100644 LinkChecker/readme.md create mode 100644 LinkChecker/requirements.txt diff --git a/LinkChecker/.gitignore b/LinkChecker/.gitignore new file mode 100644 index 00000000..c63db526 --- /dev/null +++ b/LinkChecker/.gitignore @@ -0,0 +1,5 @@ +__pycache__/ +*.pyc +venv/ +.env +*.csv \ No newline at end of file diff --git a/LinkChecker/linkchecker.py b/LinkChecker/linkchecker.py new file mode 100644 index 00000000..76977bea --- /dev/null +++ b/LinkChecker/linkchecker.py @@ -0,0 +1,117 @@ +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin +import csv +from datetime import datetime + +def get_all_links(url): + print(f"Fetching page: {url}") + try: + headers = {"User-Agent": "Mozilla/5.0"} + response = requests.get(url, timeout=10, headers=headers) + soup = BeautifulSoup(response.text, "html.parser") + links = set() + for tag in soup.find_all("a", href=True): + full_url = urljoin(url, tag["href"]) + if full_url.startswith("http"): + links.add(full_url) + print(f"Found {len(links)} links.") + return links + except Exception as e: + print(f"Error fetching page: {e}") + return set() + + +def check_link(url): + try: + headers = {"User-Agent": "Mozilla/5.0"} + response = requests.head(url, timeout=10, allow_redirects=True, headers=headers) + # Some servers don't support HEAD, fallback to GET + if response.status_code in [405, 403]: + response = requests.get(url, timeout=10, headers=headers) + return response.status_code + except 
requests.exceptions.ConnectionError:
+        return "Connection Error"
+    except requests.exceptions.Timeout:
+        return "Timeout"
+    except Exception as e:
+        return f"Error: {e}"
+
+
+def get_status_label(status):
+    if isinstance(status, int):
+        if status < 300:
+            return " ✔ OK"
+        elif status < 400:
+            return "⚠️Redirect"
+        elif status == 404:
+            return "❌Not Found"
+        elif status == 403:
+            return "🔒Forbidden"
+        elif status >= 500:
+            return "Server Error"
+    return str(status)
+
+
+def export_to_csv(results, filename):
+    with open(filename, "w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow(["URL", "Status", "Result"])
+        for url, status in results:
+            label = "Working" if isinstance(status, int) and status < 400 else "Broken"
+            writer.writerow([url, status, label])
+    print(f"Results saved to: {filename}")
+
+
+def check_all_links(website_url):
+    links = get_all_links(website_url)
+
+    if not links:
+        print("No links found.")
+        return
+
+    broken = []
+    working = []
+    all_results = []
+
+    print(f"{'STATUS':<15} URL")
+    print("-" * 60)
+
+    for i, link in enumerate(links, 1):
+        status = check_link(link)
+        label = get_status_label(status)
+        all_results.append((link, status))
+
+        print(f"{label:<15} {link}")
+
+        if isinstance(status, int) and status < 400:
+            working.append((link, status))
+        else:
+            broken.append((link, status))
+
+    # Summary
+    print("\nSUMMARY")
+    print("-" * 20)
+    print(f"Total Links: {len(links)}")
+    print(f"Working: {len(working)}")
+    print(f"Broken: {len(broken)}")
+
+    # Broken links detail
+    if broken:
+        print("\nBROKEN LINKS:")
+        for url, status in broken:
+            print(f"[{status}] {url}")
+
+    # Ask to export
+    save = input("\nSave results to CSV? 
(y/n): ").strip().lower()
+    if save == "y":
+        filename = f"link_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+        export_to_csv(all_results, filename)
+
+
+if __name__ == "__main__":
+    print("--- Link Checker Tool ---")
+    website = input("Enter website URL: ").strip()
+    if not website.startswith("http"):
+        website = "https://" + website
+    check_all_links(website)
\ No newline at end of file
diff --git a/LinkChecker/readme.md b/LinkChecker/readme.md
new file mode 100644
index 00000000..807e2e1a
--- /dev/null
+++ b/LinkChecker/readme.md
@@ -0,0 +1,76 @@
+# Link Checker
+
+A Python script that scans a website and detects broken links.
+
+The tool extracts all links from a webpage, checks their HTTP status code, and reports whether they are working or broken. Results can also be exported to a CSV file.
+
+## Features
+
+- Extracts all links from a webpage
+- Detects broken and working links
+- Shows HTTP status codes
+- Displays a summary report
+- Optional CSV export of results
+
+## Requirements
+
+- Python 3.8+
+
+Install dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+Run the script:
+
+```bash
+python linkchecker.py
+```
+
+Enter a website URL when prompted:
+
+```
+Enter website URL: https://example.com
+```
+
+## Example Output
+
+```
+--- Link Checker Tool ---
+
+Fetching page: https://example.com
+Found 25 links.
+
+STATUS          URL
+------------------------------------------------------------
+✔ OK          https://example.com/about
+⚠️ Redirect    https://example.com/docs
+❌ Not Found   https://example.com/old-page
+
+SUMMARY
+--------------------
+Total Links: 25
+Working: 23
+Broken: 2
+```
+
+## CSV Export
+
+After scanning, you can save results to a CSV file:
+
+```
+Save results to CSV? 
(y/n): +``` + +A report like this will be generated: + +``` +link_report_20260307_143210.csv +``` + +## License + +MIT License \ No newline at end of file diff --git a/LinkChecker/requirements.txt b/LinkChecker/requirements.txt new file mode 100644 index 00000000..a98ae430 --- /dev/null +++ b/LinkChecker/requirements.txt @@ -0,0 +1,2 @@ +requests +beautifulsoup4 \ No newline at end of file From 8a8365b8932b9e8b6b6c0b64e0bf2be56e0ac5c0 Mon Sep 17 00:00:00 2001 From: Jaiv Patel Date: Sat, 7 Mar 2026 11:38:42 +0530 Subject: [PATCH 2/3] Enhance link checker: Improve link extraction, add user-agent headers, and refine error handling --- LinkChecker/linkchecker.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/LinkChecker/linkchecker.py b/LinkChecker/linkchecker.py index 76977bea..fa748beb 100644 --- a/LinkChecker/linkchecker.py +++ b/LinkChecker/linkchecker.py @@ -4,17 +4,25 @@ import csv from datetime import datetime + def get_all_links(url): + """ + Fetches a URL and extracts all unique HTTP/HTTPS links. + """ print(f"Fetching page: {url}") try: headers = {"User-Agent": "Mozilla/5.0"} response = requests.get(url, timeout=10, headers=headers) soup = BeautifulSoup(response.text, "html.parser") links = set() + + # Find all 'a' tags with an 'href' attribute for tag in soup.find_all("a", href=True): full_url = urljoin(url, tag["href"]) + # Ensure we only check web links if full_url.startswith("http"): links.add(full_url) + print(f"Found {len(links)} links.") return links except Exception as e: @@ -23,14 +31,22 @@ def get_all_links(url): def check_link(url): + """ + Checks a single URL's status. It first tries a HEAD request for efficiency, + then falls back to a GET request if needed. 
+ """ try: headers = {"User-Agent": "Mozilla/5.0"} + # Use a HEAD request to get status without downloading the whole page response = requests.head(url, timeout=10, allow_redirects=True, headers=headers) - # Some servers don't support HEAD, fallback to GET + + # If HEAD is not allowed (405) or forbidden (403), try a GET request if response.status_code in [405, 403]: response = requests.get(url, timeout=10, headers=headers) + return response.status_code except requests.exceptions.ConnectionError: + # Handle cases where the server is not reachable return "Connection Error" except requests.exceptions.Timeout: return "Timeout" @@ -39,6 +55,9 @@ def check_link(url): def get_status_label(status): + """ + Converts an HTTP status code or error string into a user-friendly label. + """ if isinstance(status, int): if status < 300: return " ✔ OK" @@ -54,6 +73,9 @@ def get_status_label(status): def export_to_csv(results, filename): + """ + Exports the list of checked links and their statuses to a CSV file. + """ with open(filename, "w", newline="", encoding="utf-8") as f: writer = csv.writer(f) writer.writerow(["URL", "Status", "Result"]) @@ -64,6 +86,9 @@ def export_to_csv(results, filename): def check_all_links(website_url): + """ + Main function to orchestrate the link checking process for a given website. 
+ """ links = get_all_links(website_url) if not links: @@ -77,6 +102,7 @@ def check_all_links(website_url): print(f"{'STATUS':<15} URL") print("-" * 60) + # Iterate through all found links and check their status for i, link in enumerate(links, 1): status = check_link(link) label = get_status_label(status) @@ -84,6 +110,7 @@ def check_all_links(website_url): print(f"{label:<15} {link}") + # Categorize links as working or broken if isinstance(status, int) and status < 400: working.append((link, status)) else: @@ -109,9 +136,11 @@ def check_all_links(website_url): export_to_csv(all_results, filename) +# --- Script Entry Point --- if __name__ == "__main__": print("--- Link Checker Tool ---") website = input("Enter website URL: ").strip() + # Ensure the URL has a scheme (http or https) if not website.startswith("http"): website = "https://" + website check_all_links(website) \ No newline at end of file From dba2f7bf5ca64b952ec6893978bd9ed2d8f07ddc Mon Sep 17 00:00:00 2001 From: Jaiv Patel Date: Sat, 7 Mar 2026 12:04:02 +0530 Subject: [PATCH 3/3] Add Link Checker entry to README with description and link --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5333ecbf..a46aaa50 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ More information on contributing and the general code of conduct for discussion | JSON to YAML converter | [JSON to YAML converter](https://github.com/DhanushNehru/Python-Scripts/tree/main/JSON%20to%20YAML) | Converts JSON file to YAML files. A sample JSON is included for testing. | | Keylogger | [Keylogger](https://github.com/DhanushNehru/Python-Scripts/tree/main/Keylogger) | Keylogger that can track your keystrokes, clipboard text, take screenshots at regular intervals, and records audio. | | Keyword - Retweeting | [Keyword - Retweeting](https://github.com/DhanushNehru/Python-Scripts/tree/main/Keyword%20Retweet%20Twitter%20Bot) | Find the latest tweets containing given keywords and then retweet them. 
|
+| Link Checker | [Link Checker](https://github.com/DhanushNehru/Python-Scripts/tree/main/LinkChecker) | A Python script that scans a website and detects broken links. |
 | LinkedIn Bot | [LinkedIn Bot](https://github.com/DhanushNehru/Python-Scripts/tree/main/LinkedIn%20Bot) | Automates the process of searching for public profiles on LinkedIn and exporting the data to an Excel sheet. |
 | Longitude & Latitude to conical coverter | [Longitude Latitude conical converter](master/Longitude%20Latitude%20conical%20converter) | Converts Longitude and Latitude to Lambert conformal conic projection. |
 | Mail Sender | [Mail Sender](https://github.com/DhanushNehru/Python-Scripts/tree/main/Mail%20Sender) | Sends an email. |