[sixxs] improve parsing

This commit is contained in:
Nico Schottelius 2020-12-05 12:49:11 +01:00
commit 1fbc9e06ff

View file

@ -15,34 +15,69 @@ class MyHTMLParser(HTMLParser):
self.results = []
self.th = False
self.prefix = False
self.name = False
self.org = False
self.website = False
def handle_starttag(self, tag, attrs):
    """Track table position and arm the capture flag for the next text node.

    A ``<th>`` sets ``self.th`` so ``handle_data`` can inspect header text.
    Once ``self.in_table`` is set, each ``<tr>`` starts a row and resets the
    column counter, and each ``<td>`` advances it.  The flags ``name`` and
    ``org`` are armed by the 2nd and 3rd ``<td>``; ``prefix`` and ``website``
    are armed by an ``<a>`` inside the 1st and 4th column respectively.

    Args:
        tag: Lower-cased tag name supplied by ``HTMLParser``.
        attrs: List of ``(name, value)`` attribute pairs; unused here.
    """
    if tag == "th":
        self.th = True

    if self.in_table and tag == "tr":
        self.in_row = True
        self.col_index = 0
    elif self.in_row and tag == "td":
        self.col_index += 1
        if self.col_index == 2:
            self.name = True
        elif self.col_index == 3:
            self.org = True
    elif self.in_row and tag == "a":
        # Only links inside a data row count.  Without the in_row guard a
        # link after the table would be matched against the stale col_index
        # of the last row and overwrite that row's record.
        if self.col_index == 1:
            self.prefix = True
        elif self.col_index == 4:
            self.website = True
def handle_endtag(self, tag):
    """Leave header/row state and disarm capture flags when elements close.

    Args:
        tag: Lower-cased tag name supplied by ``HTMLParser``.
    """
    if tag == "th":
        self.th = False
    elif tag == "tr":
        self.in_row = False
    elif tag == "a":
        # An empty link never reaches handle_data, so its flag would
        # otherwise stay armed and grab an unrelated later text node.
        self.prefix = False
        self.website = False
    elif tag == "td":
        # Same reasoning for empty cells: nothing may leak past the cell.
        self.prefix = False
        self.name = False
        self.org = False
        self.website = False
def handle_data(self, data):
    """Store a text node in the current record according to the armed flag.

    A header cell whose text is ``Prefix`` marks the start of the table of
    interest.  Text following an armed ``prefix`` flag opens a new record
    and appends it to ``self.results``; text following ``name``, ``org`` or
    ``website`` fills the matching key of that record.  Each flag is
    cleared once it has been consumed.

    Args:
        data: Raw text between tags, as supplied by ``HTMLParser``.
    """
    text = data.strip()
    if not text:
        # Whitespace between tags must not consume a pending flag or be
        # stored as a value.
        return

    if self.th and text == "Prefix":
        print("Found table start")
        self.in_table = True

    if self.prefix:
        self.record = {}
        self.record['prefix'] = text
        self.results.append(self.record)
        self.prefix = False
    elif self.name:
        self.record['name'] = text
        self.name = False
    elif self.org:
        self.record['organization'] = text
        self.org = False
    elif self.website:
        self.record['website'] = text
        self.website = False
def report(self):
    """Print every collected record, one per line, in the order parsed."""
    for record in self.results:
        print(record)
if __name__ == '__main__':
url = "https://www.sixxs.net/tools/grh/ula/list/"
@ -51,3 +86,5 @@ if __name__ == '__main__':
html = "\n".join([ line.decode('utf-8') for line in response.readlines() ])
parser.feed(html)
parser.report()