"""Scrape athlete biography data from a results web page into a CSV file.

Requirements (requirements.txt): requests, beautifulsoup4

Setup:
    python -m venv .env
    source .env/bin/activate
    pip install -r requirements.txt

Usage:
    python script.py URL

Example URL:
    https://www.isuresults.com/bios/isufs00103105.htm

Rows are appended to ``webpage_data.csv`` with the columns
First Name, Last Name, Birth Place, Country, Birth Date, Type, Coach, ID.
"""
import csv
import os
import sys

import requests
import urllib3
from bs4 import BeautifulSoup

# Requests below are made with verify=False, so silence the warning urllib3
# would otherwise print for every unverified HTTPS request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

CSV_HEADER = ['First Name', 'Last Name', 'Birth Place', 'Country', 'Birth Date', 'Type', 'Coach', 'ID']


def _id_endswith(suffix):
    """Return an ``id=`` filter matching element ids that end with *suffix*."""
    return lambda value: bool(value) and value.endswith(suffix)


def _find_label(node, suffix):
    """Return the first <span> under *node* whose id ends with *suffix*, or None."""
    if node is None:
        return None
    return node.find('span', id=_id_endswith(suffix))


def _text(span):
    """Return the stripped text of *span*, or '' when the span is missing."""
    return span.get_text(strip=True) if span is not None else ''


def _split_name(full_name):
    """Split "First LAST NAME" into (first, last); return None for an empty name.

    The first whitespace-separated token is the first name; the last name is
    made of the remaining tokens that are entirely upper case.
    """
    parts = full_name.split()
    if not parts:
        return None
    last_name = ' '.join(part for part in parts[1:] if part.isupper())
    return parts[0], last_name


def parse_webpage(url, *, output_path='webpage_data.csv', timeout=30):
    """Fetch *url*, extract one row per athlete and append the rows to a CSV.

    Args:
        url: Address of the biography page. The last path component without
            its extension is stored in the ``ID`` column.
        output_path: CSV file to append to. A header row is written first
            when the file is empty.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Prints "Failed to fetch webpage" when the request fails or the
        response status is not 200; nothing is written in that case.
    """
    try:
        # NOTE(review): certificate verification is disabled, as in the
        # original script; presumably the target host fails validation.
        # Confirm and re-enable if possible.
        response = requests.get(url, verify=False, timeout=timeout)
    except requests.RequestException:
        print("Failed to fetch webpage")
        return

    if response.status_code != 200:
        print("Failed to fetch webpage")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    discipline = _text(_find_label(soup, '_CategoryLabel'))
    page_id = url.split('/')[-1].split('.')[0]

    # Matches both "nestedBiography" tables and the plain "biography" table
    # used when the page describes a single athlete.
    tables = soup.find_all('table', class_=lambda cls: cls and cls.endswith('iography'))
    # Page-level table, used as a fallback source for the coach label.
    page_table = soup.find('table', class_='biography')

    # A dict keeps insertion order, so duplicates are dropped while the CSV
    # rows stay in page order on every run.
    rows = {}

    for table in tables:
        name = _split_name(_text(_find_label(table, '_person_cnameLabel')))
        if name is None:
            # No athlete name in this table: nothing to record.
            continue
        first_name, last_name = name

        coach = _find_label(table, '_person_media_information_coachLabel')
        if coach is None:
            coach = _find_label(page_table, '_person_media_information_coachLabel')

        row = (
            first_name,
            last_name,
            _text(_find_label(table, '_person_pobLabel')),
            _text(_find_label(table, '_person_nationLabel')),
            _text(_find_label(table, '_person_dobLabel')),
            discipline,
            _text(coach),
            page_id,
        )
        rows[row] = None

    with open(output_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Check the size of the handle we actually opened, so the header is
        # written exactly once per file regardless of the path used.
        if os.fstat(csvfile.fileno()).st_size == 0:
            writer.writerow(CSV_HEADER)
        writer.writerows(rows)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script.py URL")
    else:
        parse_webpage(sys.argv[1])