
URL Extractor Python Script

import sys
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup

def extract_urls(file_input, file_output):
    try:
        with open(file_input, 'r') as input_file:
            links = [line.strip() for line in input_file if line.strip()]  # skip blank lines

        results = []

        for link in links:
            try:
                page = requests.get(link, timeout=10)  # timeout so one slow host cannot hang the run
                soup = BeautifulSoup(page.text, "lxml")
                extlist = set()
                intlist = set()

                for a in soup.find_all("a", attrs={"href": True}):
                    # Skip empty anchors, fragments, and non-HTTP pseudo-links
                    if (
                        len(a["href"].strip()) > 1
                        and a["href"][0] != "#"
                        and "javascript:" not in a["href"].strip()
                        and "mailto:" not in a["href"].strip()
                        and "tel:" not in a["href"].strip()
                    ):
                        # Absolute URL: compare hostnames to classify internal vs external
                        if a["href"].strip().startswith(("http://", "https://")):
                            if (
                                urlparse(link).netloc.lower()
                                in urlparse(a["href"].strip()).netloc.lower()
                            ):
                                intlist.add(a["href"])
                            else:
                                extlist.add(a["href"])
                        else:
                            intlist.add(a["href"])

                result = "\n"
                result += link + "\n"
                result += "---------------------\n\n"
                result += str(len(intlist)) + " internal links found:\n\n"
                for il in intlist:
                    result += il + "\n"
                result += "\n" + str(len(extlist)) + " external links found:\n\n"
                for el in extlist:
                    result += el + "\n"
                result += "\n"

                results.append(result)
                
            except Exception as page_error:
                results.append(f"Failed to process {link}: {page_error}\n")

        with open(file_output, 'w') as output_file:
            output_file.writelines(results)

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python3 script.py listtarget.txt hasil.txt")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        extract_urls(input_file, output_file)
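
The script assumes the third-party packages it imports are installed: requests, beautifulsoup4, and lxml (the parser passed to BeautifulSoup). If they are missing, install them first:

pip install requests beautifulsoup4 lxml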


  1. The logic is wrapped in an extract_urls function for readability and modularity.
  2. The list of target URLs is read from listtarget.txt (first argument); a sample is shown below.
  3. The results are written to hasil.txt (second argument).
  4. Errors are handled per URL, so the run does not stop when a single URL fails.
  5. General error handling at the top level, plus an argument check to make sure the script is invoked correctly.
Run

python3 script.py listtarget.txt hasil.txt
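
For reference, listtarget.txt is just a plain-text file with one target URL per line, for example (hypothetical targets):

https://example.com
https://example.org/blog

Each target then produces one block in hasil.txt shaped like this (the URLs and counts below are illustrative, not real output):

https://example.com
---------------------

2 internal links found:

https://example.com/about
https://example.com/contact

1 external links found:

https://www.iana.org/domains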


To save only results containing a keyword

import sys
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup

def extract_urls(file_input, file_output, keyword):
    try:
        with open(file_input, 'r') as input_file:
            links = [line.strip() for line in input_file if line.strip()]  # skip blank lines

        results = []

        for link in links:
            try:
                page = requests.get(link, timeout=10)  # timeout so one slow host cannot hang the run
                soup = BeautifulSoup(page.text, "lxml")
                extlist = set()
                intlist = set()

                for a in soup.find_all("a", attrs={"href": True}):
                    # Skip empty anchors, fragments, and non-HTTP pseudo-links
                    if (
                        len(a["href"].strip()) > 1
                        and a["href"][0] != "#"
                        and "javascript:" not in a["href"].strip()
                        and "mailto:" not in a["href"].strip()
                        and "tel:" not in a["href"].strip()
                    ):
                        # Absolute URL: compare hostnames to classify internal vs external
                        if a["href"].strip().startswith(("http://", "https://")):
                            if (
                                urlparse(link).netloc.lower()
                                in urlparse(a["href"].strip()).netloc.lower()
                            ):
                                intlist.add(a["href"])
                            else:
                                extlist.add(a["href"])
                        else:
                            intlist.add(a["href"])

                # Keep only links matching the keyword, so the reported counts
                # match the lines actually written to the output file.
                int_hits = [il for il in intlist if keyword in il]
                ext_hits = [el for el in extlist if keyword in el]

                if int_hits or ext_hits:
                    result = "\n"
                    result += link + "\n"
                    result += "---------------------\n\n"
                    result += str(len(int_hits)) + " internal links found:\n\n"
                    for il in int_hits:
                        result += il + "\n"
                    result += "\n" + str(len(ext_hits)) + " external links found:\n\n"
                    for el in ext_hits:
                        result += el + "\n"
                    result += "\n"

                    results.append(result)
                
            except Exception as page_error:
                results.append(f"Failed to process {link}: {page_error}\n")

        with open(file_output, 'w') as output_file:
            output_file.writelines(results)

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python3 script.py listtarget.txt hasil.txt keyword")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        keyword = sys.argv[3]
        extract_urls(input_file, output_file, keyword)

Run

python3 script.py listtarget.txt hasil.txt keyword
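
Here keyword is any substring to match against the extracted URLs (for example wp-content, chosen arbitrarily); only matching links are written, and a target with no matching link produces no block in hasil.txt at all.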

To save results excluding a keyword

import sys
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup

def extract_urls(file_input, file_output, keyword):
    try:
        with open(file_input, 'r') as input_file:
            links = [line.strip() for line in input_file if line.strip()]  # skip blank lines

        results = []

        for link in links:
            try:
                page = requests.get(link, timeout=10)  # timeout so one slow host cannot hang the run
                soup = BeautifulSoup(page.text, "lxml")
                extlist = set()
                intlist = set()

                for a in soup.find_all("a", attrs={"href": True}):
                    # Skip empty anchors, fragments, and non-HTTP pseudo-links
                    if (
                        len(a["href"].strip()) > 1
                        and a["href"][0] != "#"
                        and "javascript:" not in a["href"].strip()
                        and "mailto:" not in a["href"].strip()
                        and "tel:" not in a["href"].strip()
                    ):
                        # Absolute URL: compare hostnames to classify internal vs external
                        if a["href"].strip().startswith(("http://", "https://")):
                            if (
                                urlparse(link).netloc.lower()
                                in urlparse(a["href"].strip()).netloc.lower()
                            ):
                                intlist.add(a["href"])
                            else:
                                extlist.add(a["href"])
                        else:
                            intlist.add(a["href"])

                # Keep only links that do NOT contain the keyword, so the
                # reported counts match the lines actually written.
                int_kept = [il for il in intlist if keyword not in il]
                ext_kept = [el for el in extlist if keyword not in el]

                if int_kept or ext_kept:
                    result = "\n"
                    result += link + "\n"
                    result += "---------------------\n\n"
                    result += str(len(int_kept)) + " internal links found:\n\n"
                    for il in int_kept:
                        result += il + "\n"
                    result += "\n" + str(len(ext_kept)) + " external links found:\n\n"
                    for el in ext_kept:
                        result += el + "\n"
                    result += "\n"

                    results.append(result)

            except Exception as page_error:
                results.append(f"Failed to process {link}: {page_error}\n")

        with open(file_output, 'w') as output_file:
            output_file.writelines(results)

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python3 script.py listtarget.txt hasil.txt keyword")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        keyword = sys.argv[3]
        extract_urls(input_file, output_file, keyword)

Run

python3 script.py listtarget.txt hasil.txt keyword
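
The exclusion variant works the same way but inverted: any link whose URL contains keyword is dropped, and a block is still written for every target that has at least one remaining link.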