diff --git a/LinkChecker/.gitignore b/LinkChecker/.gitignore
new file mode 100644
index 00000000..c63db526
--- /dev/null
+++ b/LinkChecker/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+*.pyc
+venv/
+.env
+*.csv
\ No newline at end of file
diff --git a/LinkChecker/linkchecker.py b/LinkChecker/linkchecker.py
new file mode 100644
index 00000000..fa748beb
--- /dev/null
+++ b/LinkChecker/linkchecker.py
@@ -0,0 +1,192 @@
+"""Scan a web page for links and report which ones are broken."""
+
+import csv
+from datetime import datetime
+from urllib.parse import urldefrag, urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+# Some servers reject the default python-requests user agent
+HEADERS = {"User-Agent": "Mozilla/5.0"}
+# Seconds to wait on any single request before giving up
+TIMEOUT = 10
+
+
+def get_all_links(url):
+    """
+    Fetches a URL and extracts all unique HTTP/HTTPS links.
+
+    Relative links are resolved against the page URL and fragments are
+    dropped, so "page#a" and "page#b" are only checked once. Returns an
+    empty set if the page cannot be fetched.
+    """
+    print(f"Fetching page: {url}")
+    try:
+        response = requests.get(url, timeout=TIMEOUT, headers=HEADERS)
+        # Do not harvest links from an error page (404, 500, ...)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching page: {e}")
+        return set()
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    links = set()
+
+    # Find all 'a' tags with an 'href' attribute
+    for tag in soup.find_all("a", href=True):
+        try:
+            full_url = urldefrag(urljoin(url, tag["href"])).url
+            scheme = urlparse(full_url).scheme
+        except ValueError:
+            # Malformed href (e.g. a broken IPv6 literal): nothing to check
+            continue
+        # Ensure we only check web links (skips mailto:, tel:, javascript:)
+        if scheme in ("http", "https"):
+            links.add(full_url)
+
+    print(f"Found {len(links)} links.")
+    return links
+
+
+def check_link(url):
+    """
+    Checks a single URL's status. It first tries a HEAD request for efficiency,
+    then falls back to a GET request if needed.
+
+    Returns the integer HTTP status code, or a string describing the failure.
+    """
+    try:
+        # Use a HEAD request to get status without downloading the whole page
+        response = requests.head(
+            url, timeout=TIMEOUT, allow_redirects=True, headers=HEADERS
+        )
+        status = response.status_code
+
+        # If HEAD is forbidden (403) or not supported (405, 501), try GET.
+        # stream=True fetches only the headers, so no body is downloaded.
+        if status in (403, 405, 501):
+            with requests.get(
+                url, timeout=TIMEOUT, headers=HEADERS, stream=True
+            ) as response:
+                status = response.status_code
+
+        return status
+    except requests.exceptions.Timeout:
+        # Checked first: ConnectTimeout is both a Timeout and a ConnectionError
+        return "Timeout"
+    except requests.exceptions.ConnectionError:
+        # Handle cases where the server is not reachable
+        return "Connection Error"
+    except Exception as e:
+        # Deliberately broad: one odd URL must not abort the whole scan
+        return f"Error: {e}"
+
+
+def is_working(status):
+    """
+    Returns True for an HTTP status code below 400.
+
+    Error strings such as "Timeout" always count as broken.
+    """
+    return isinstance(status, int) and status < 400
+
+
+def get_status_label(status):
+    """
+    Converts an HTTP status code or error string into a user-friendly label.
+    """
+    if isinstance(status, int):
+        if status < 300:
+            return " ✔ OK"
+        elif status < 400:
+            return "⚠️Redirect"
+        elif status == 404:
+            return "❌Not Found"
+        elif status == 403:
+            return "🔒Forbidden"
+        elif status >= 500:
+            return "Server Error"
+    # Any other 4xx code, and every error string, is shown as-is
+    return str(status)
+
+
+def export_to_csv(results, filename):
+    """
+    Exports the list of checked links and their statuses to a CSV file.
+    """
+    with open(filename, "w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow(["URL", "Status", "Result"])
+        for url, status in results:
+            label = "Working" if is_working(status) else "Broken"
+            writer.writerow([url, status, label])
+    print(f"Results saved to: {filename}")
+
+
+def check_all_links(website_url):
+    """
+    Main function to orchestrate the link checking process for a given website.
+    """
+    links = get_all_links(website_url)
+
+    if not links:
+        print("No links found.")
+        return
+
+    broken = []
+    all_results = []
+
+    print(f"{'STATUS':<15} URL")
+    print("-" * 60)
+
+    # Sorted so the report order is stable between runs (sets are unordered)
+    for link in sorted(links):
+        status = check_link(link)
+        label = get_status_label(status)
+        all_results.append((link, status))
+
+        print(f"{label:<15} {link}")
+
+        if not is_working(status):
+            broken.append((link, status))
+
+    # Summary
+    print("\nSUMMARY")
+    print("-" * 20)
+    print(f"Total Links: {len(links)}")
+    print(f"Working: {len(links) - len(broken)}")
+    print(f"Broken: {len(broken)}")
+
+    # Broken links detail
+    if broken:
+        print("\nBROKEN LINKS:")
+        for url, status in broken:
+            print(f"[{status}] {url}")
+
+    # Ask to export
+    save = input("\nSave results to CSV? (y/n): ").strip().lower()
+    if save == "y":
+        filename = f"link_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+        export_to_csv(all_results, filename)
+
+
+def main():
+    """
+    Prompts for a website URL and runs the link check on it.
+    """
+    print("--- Link Checker Tool ---")
+    website = input("Enter website URL: ").strip()
+    if not website:
+        print("No URL entered.")
+        return
+    # Ensure the URL has a scheme. Testing for a bare "http" prefix would
+    # wrongly skip hosts such as "httpbin.org"
+    if not website.lower().startswith(("http://", "https://")):
+        website = "https://" + website
+    check_all_links(website)
+
+
+# --- Script Entry Point ---
+if __name__ == "__main__":
+    main()
diff --git a/LinkChecker/readme.md b/LinkChecker/readme.md
new file mode 100644
index 00000000..807e2e1a
--- /dev/null
+++ b/LinkChecker/readme.md
@@ -0,0 +1,76 @@
+# Link Checker
+
+A Python script that scans a website and detects broken links.
+
+The tool extracts all links from a webpage, checks their HTTP status code, and reports whether they are working or broken. Results can also be exported to a CSV file.
+ +## Features + +- Extracts all links from a webpage +- Detects broken and working links +- Shows HTTP status codes +- Displays a summary report +- Optional CSV export of results + +## Requirements + +- Python 3.8+ + +Install dependencies: + +```bash +pip install -r requirements.txt +``` + +## Usage + +Run the script: + +```bash +python linkchecker.py +``` + +Enter a website URL when prompted: + +``` +Enter website URL: https://example.com +``` + +## Example Output + +``` +--- Link Checker Tool --- + +Fetching page: https://example.com +Found 25 links. + +STATUS URL +------------------------------------------------------------ +✔ OK https://example.com/about +⚠️ Redirect https://example.com/docs +❌ Not Found https://example.com/old-page + +SUMMARY +-------------------- +Total Links: 25 +Working: 23 +Broken: 2 +``` + +## CSV Export + +After scanning, you can save results to a CSV file: + +``` +Save results to CSV? (y/n): +``` + +A report like this will be generated: + +``` +link_report_20260307_143210.csv +``` + +## License + +MIT License \ No newline at end of file diff --git a/LinkChecker/requirements.txt b/LinkChecker/requirements.txt new file mode 100644 index 00000000..a98ae430 --- /dev/null +++ b/LinkChecker/requirements.txt @@ -0,0 +1,2 @@ +requests +beautifulsoup4 \ No newline at end of file diff --git a/README.md b/README.md index 5333ecbf..a46aaa50 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ More information on contributing and the general code of conduct for discussion | JSON to YAML converter | [JSON to YAML converter](https://github.com/DhanushNehru/Python-Scripts/tree/main/JSON%20to%20YAML) | Converts JSON file to YAML files. A sample JSON is included for testing. | | Keylogger | [Keylogger](https://github.com/DhanushNehru/Python-Scripts/tree/main/Keylogger) | Keylogger that can track your keystrokes, clipboard text, take screenshots at regular intervals, and records audio. 
| | Keyword - Retweeting | [Keyword - Retweeting](https://github.com/DhanushNehru/Python-Scripts/tree/main/Keyword%20Retweet%20Twitter%20Bot) | Find the latest tweets containing given keywords and then retweet them. | +| Link Checker | [Link Checker](https://github.com/DhanushNehru/Python-Scripts/tree/main/LinkChecker) | A Python script that scans a website and detects broken links. | | LinkedIn Bot | [LinkedIn Bot](https://github.com/DhanushNehru/Python-Scripts/tree/main/LinkedIn%20Bot) | Automates the process of searching for public profiles on LinkedIn and exporting the data to an Excel sheet. | | Longitude & Latitude to conical coverter | [Longitude Latitude conical converter](master/Longitude%20Latitude%20conical%20converter) | Converts Longitude and Latitude to Lambert conformal conic projection. | | Mail Sender | [Mail Sender](https://github.com/DhanushNehru/Python-Scripts/tree/main/Mail%20Sender) | Sends an email. |