import sys
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def extract_urls(file_input, file_output):
    try:
        with open(file_input, 'r') as input_file:
            links = input_file.read().splitlines()

        results = []
        for link in links:
            try:
                page = requests.get(link)
                soup = BeautifulSoup(page.text, "lxml")
                extlist = set()
                intlist = set()

                for a in soup.find_all("a", attrs={"href": True}):
                    href = a["href"].strip()
                    # Skip empty anchors, fragment-only links, and
                    # javascript:/mailto:/tel: pseudo-URLs.
                    if (
                        len(href) > 1
                        and href[0] != "#"
                        and "javascript:" not in href
                        and "mailto:" not in href
                        and "tel:" not in href
                    ):
                        if href.startswith(("http://", "https://")):
                            # Absolute URL: internal when the page's host is
                            # contained in the link's host.
                            if urlparse(link).netloc.lower() in urlparse(href).netloc.lower():
                                intlist.add(href)
                            else:
                                extlist.add(href)
                        else:
                            # Relative URL: always internal.
                            intlist.add(href)

                result = "\n"
                result += link + "\n"
                result += "---------------------\n\n"
                result += str(len(intlist)) + " internal links found:\n\n"
                for il in intlist:
                    result += il + "\n"
                result += "\n" + str(len(extlist)) + " external links found:\n\n"
                for el in extlist:
                    result += el + "\n"
                result += "\n"
                results.append(result)
            except Exception as page_error:
                results.append(f"Failed to process {link}: {page_error}\n")

        with open(file_output, 'w') as output_file:
            output_file.writelines(results)
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python3 script.py listtarget.txt hasil.txt")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        extract_urls(input_file, output_file)
Run

python3 script.py listtarget.txt hasil.txt
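The heart of all three scripts is the internal-versus-external decision, so here is that rule pulled out into a minimal, self-contained sketch (the function name classify_href and the example URLs are hypothetical, for illustration only):

from urllib.parse import urlparse

def classify_href(page_url, href):
    # Relative hrefs stay on the same site, so they count as internal.
    if not href.startswith(("http://", "https://")):
        return "internal"
    # Absolute hrefs are internal when the page's host is contained in
    # the link's host (so subdomains of the page's host count as internal).
    if urlparse(page_url).netloc.lower() in urlparse(href).netloc.lower():
        return "internal"
    return "external"

print(classify_href("https://example.com/blog", "/about"))                   # internal
print(classify_href("https://example.com/blog", "https://example.com/x"))   # internal
print(classify_href("https://example.com/blog", "https://other.org/page"))  # external

The next variant of the script adds a keyword argument and saves only the links whose URLs contain that keyword.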
import sys
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def extract_urls(file_input, file_output, keyword):
    try:
        with open(file_input, 'r') as input_file:
            links = input_file.read().splitlines()

        results = []
        for link in links:
            try:
                page = requests.get(link)
                soup = BeautifulSoup(page.text, "lxml")
                extlist = set()
                intlist = set()

                for a in soup.find_all("a", attrs={"href": True}):
                    href = a["href"].strip()
                    # Skip empty anchors, fragment-only links, and
                    # javascript:/mailto:/tel: pseudo-URLs.
                    if (
                        len(href) > 1
                        and href[0] != "#"
                        and "javascript:" not in href
                        and "mailto:" not in href
                        and "tel:" not in href
                    ):
                        if href.startswith(("http://", "https://")):
                            # Absolute URL: internal when the page's host is
                            # contained in the link's host.
                            if urlparse(link).netloc.lower() in urlparse(href).netloc.lower():
                                intlist.add(href)
                            else:
                                extlist.add(href)
                        else:
                            # Relative URL: always internal.
                            intlist.add(href)

                # Keep only the links that contain the keyword, so the
                # reported counts match the links actually written out.
                matched_int = [il for il in intlist if keyword in il]
                matched_ext = [el for el in extlist if keyword in el]

                # Skip pages with no matching links at all.
                if matched_int or matched_ext:
                    result = "\n"
                    result += link + "\n"
                    result += "---------------------\n\n"
                    result += str(len(matched_int)) + " internal links found:\n\n"
                    for il in matched_int:
                        result += il + "\n"
                    result += "\n" + str(len(matched_ext)) + " external links found:\n\n"
                    for el in matched_ext:
                        result += el + "\n"
                    result += "\n"
                    results.append(result)
            except Exception as page_error:
                results.append(f"Failed to process {link}: {page_error}\n")

        with open(file_output, 'w') as output_file:
            output_file.writelines(results)
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python3 script.py listtarget.txt hasil.txt keyword")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        keyword = sys.argv[3]
        extract_urls(input_file, output_file, keyword)
Run
python3 script.py listtarget.txt hasil.txt keyword
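Note that keyword in url is a plain case-sensitive substring test, so a keyword of admin will not match /Admin/login. A minimal sketch of a case-insensitive variant, in case that is wanted (the helper name matches_keyword and the sample URLs are hypothetical, not part of the original script):

def matches_keyword(url, keyword):
    # Lowercase both sides so Admin, ADMIN, and admin all match.
    return keyword.lower() in url.lower()

urls = {"https://example.com/Admin/login", "https://example.com/blog"}
print([u for u in urls if matches_keyword(u, "admin")])
# -> ['https://example.com/Admin/login']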
Save result excluding keyword

import sys
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def extract_urls(file_input, file_output, keyword):
    try:
        with open(file_input, 'r') as input_file:
            links = input_file.read().splitlines()

        results = []
        for link in links:
            try:
                page = requests.get(link)
                soup = BeautifulSoup(page.text, "lxml")
                extlist = set()
                intlist = set()

                for a in soup.find_all("a", attrs={"href": True}):
                    href = a["href"].strip()
                    # Skip empty anchors, fragment-only links, and
                    # javascript:/mailto:/tel: pseudo-URLs.
                    if (
                        len(href) > 1
                        and href[0] != "#"
                        and "javascript:" not in href
                        and "mailto:" not in href
                        and "tel:" not in href
                    ):
                        if href.startswith(("http://", "https://")):
                            # Absolute URL: internal when the page's host is
                            # contained in the link's host.
                            if urlparse(link).netloc.lower() in urlparse(href).netloc.lower():
                                intlist.add(href)
                            else:
                                extlist.add(href)
                        else:
                            # Relative URL: always internal.
                            intlist.add(href)

                # Keep only the links that do NOT contain the keyword, so the
                # reported counts match the links actually written out.
                kept_int = [il for il in intlist if keyword not in il]
                kept_ext = [el for el in extlist if keyword not in el]

                # Skip pages where nothing survives the filter.
                if kept_int or kept_ext:
                    result = "\n"
                    result += link + "\n"
                    result += "---------------------\n\n"
                    result += str(len(kept_int)) + " internal links found:\n\n"
                    for il in kept_int:
                        result += il + "\n"
                    result += "\n" + str(len(kept_ext)) + " external links found:\n\n"
                    for el in kept_ext:
                        result += el + "\n"
                    result += "\n"
                    results.append(result)
            except Exception as page_error:
                results.append(f"Failed to process {link}: {page_error}\n")

        with open(file_output, 'w') as output_file:
            output_file.writelines(results)
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python3 script.py listtarget.txt hasil.txt keyword")
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]
        keyword = sys.argv[3]
        extract_urls(input_file, output_file, keyword)
Run
python3 script.py listtarget.txt hasil.txt keyword
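All three scripts call requests.get(link) with no timeout and no headers, so a single unresponsive host can stall an entire run, and some sites reject the default client. A minimal hardening sketch that could replace the bare requests.get(link) call (the fetch helper name and the User-Agent string are illustrative assumptions, not part of the original scripts):

import requests

def fetch(link):
    # A timeout keeps one dead host from hanging the whole run, and a
    # User-Agent header avoids blanket blocks on the default client string.
    headers = {"User-Agent": "Mozilla/5.0 (link-extractor script)"}
    return requests.get(link, headers=headers, timeout=10)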