first commit
This commit is contained in:
commit
6e2e743659
5 changed files with 95 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
.env/
|
21
README.md
Normal file
21
README.md
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
Install:
|
||||||
|
|
||||||
|
```
|
||||||
|
python3.9 -m venv .env
|
||||||
|
source .env/bin/activate
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
```
|
||||||
|
python script.py URL
|
||||||
|
```
|
||||||
|
|
||||||
|
Tested urls:
|
||||||
|
|
||||||
|
OK:
|
||||||
|
https://www.isuresults.com/bios/isufs00103105.htm
|
||||||
|
https://www.isuresults.com/bios/isufs00103105.htm
|
||||||
|
|
||||||
|
No:
|
||||||
|
https://www.isuresults.com/bios/isufs00103105.htm
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
requests
|
||||||
|
beautifulsoup4
|
67
script.py
Normal file
67
script.py
Normal file
|
@ -0,0 +1,67 @@
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import csv
|
||||||
|
import urllib3
|
||||||
|
import os
|
||||||
|
|
||||||
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
|
def parse_webpage(url):
    """Scrape an ISU athlete biography page and append one CSV row per
    athlete biography table found on it.

    Parameters:
        url: address of an isuresults.com bio page, e.g.
             https://www.isuresults.com/bios/isufs00103105.htm

    Side effects:
        Appends rows to 'webpage_data.csv' in the current directory
        (writing a header first when the file is empty); prints an
        error message when the page cannot be fetched.
    """
    # NOTE(review): certificate verification is deliberately disabled
    # (the matching urllib3 warning is silenced at module level) --
    # confirm the target host actually requires this.
    response = requests.get(url, verify=False)

    if response.status_code != 200:
        print("Failed to fetch webpage")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Discipline (e.g. "ICE DANCE") is shared by every athlete on the page.
    discipline_type = soup.find('span', id=lambda x: x and x.endswith('_CategoryLabel'))
    discipline_type_cell = discipline_type.get_text(strip=True) if discipline_type else ''

    # Matches both 'nestedBiography' (couples) and 'biography'
    # (single athlete) tables.
    tables = soup.find_all('table', class_=lambda x: x and x.endswith('iography'))

    # Fallback container for the coach label when a nested table lacks it.
    table_coach = soup.find('table', class_='biography')

    # Page id, e.g. 'isufs00103105' from '.../isufs00103105.htm'.
    page_id = url.split('/')[-1].split('.')[0]

    unique_keys = set()

    for table in tables:
        name_cells = table.find_all('span', id=lambda x: x and x.endswith('_person_cnameLabel'))

        first_names = []
        last_names = []
        for name_cell in name_cells:
            name_parts = name_cell.get_text(strip=True).split()
            if not name_parts:
                # BUG FIX: an empty name label used to crash on name_parts[0].
                continue
            first_names.append(name_parts[0])
            # Last names are rendered in upper case on these pages.
            last_names.append(' '.join(part for part in name_parts[1:] if part.isupper()))

        if not first_names:
            # BUG FIX: the original indexed first_names[0] unconditionally
            # and raised IndexError on tables without a name label.
            continue

        birth_place = table.find('span', id=lambda x: x and x.endswith('_person_pobLabel'))
        birth_place_cell = birth_place.get_text(strip=True) if birth_place else ''

        country = table.find('span', id=lambda x: x and x.endswith('_person_nationLabel'))
        country_cell = country.get_text(strip=True) if country else ''

        birth_date = table.find('span', id=lambda x: x and x.endswith('_person_dobLabel'))
        birth_date_cell = birth_date.get_text(strip=True) if birth_date else ''

        coach = table.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        if not coach and table_coach is not None:
            # BUG FIX: guard against table_coach being None before .find().
            coach = table_coach.find('span', id=lambda x: x and x.endswith('_person_media_information_coachLabel'))
        coach_cell = coach.get_text(strip=True) if coach else ''

        # Only the first listed athlete's names are recorded per table,
        # matching the original behavior.
        unique_keys.add((first_names[0], last_names[0], birth_place_cell,
                         country_cell, birth_date_cell, discipline_type_cell,
                         coach_cell, page_id))

    with open('webpage_data.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # A zero-size file means we just created it: write the header once.
        if os.stat('webpage_data.csv').st_size == 0:
            writer.writerow(['First Name', 'Last Name', 'Birth Place', 'Country',
                             'Birth Date', 'Type', 'Coach', 'ID'])
        for keys in unique_keys:
            writer.writerow(keys)
|
if __name__ == "__main__":
    # Expect exactly one CLI argument: the bio page URL to scrape.
    if len(sys.argv) == 2:
        parse_webpage(sys.argv[1])
    else:
        print("Usage: python script.py <url>")
4
webpage_data.csv
Normal file
4
webpage_data.csv
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
First Name,Last Name,Birth Place,Country,Birth Date,Type,Coach,ID
|
||||||
|
Karolina,CALHOUN,Long Beach USA,BRA,29.10.1999,ICE DANCE,"Christine Binder, Vitali Novikov",isufs00103105
|
||||||
|
Michael,VALDEZ,Pasadena USA,BRA,06.08.1997,ICE DANCE,"Christine Binder, Vitali Novikov",isufs00103105
|
||||||
|
Mirai,NAGASU,"Montebello, CA",USA,16.04.1993,LADIES,Tom Zakrajsek,isufs00010220
|
|
Loading…
Reference in a new issue