forked from ungleich-public/ipv6ula
[sixxs] improve parsing
This commit is contained in:
parent
403b53e406
commit
1fbc9e06ff
1 changed files with 51 additions and 14 deletions
|
|
@ -15,34 +15,69 @@ class MyHTMLParser(HTMLParser):
|
|||
|
||||
self.results = []
|
||||
|
||||
self.th = False
|
||||
self.prefix = False
|
||||
self.name = False
|
||||
self.org = False
|
||||
self.website = False
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
# print(f"Encountered a start tag: '{tag}'")
|
||||
|
||||
if tag == "tbody":
|
||||
self.tbody_count += 1
|
||||
print(f"tbody: {self.tbody_count} {attrs}")
|
||||
if tag == "th":
|
||||
self.th = True
|
||||
|
||||
if self.tbody_count == 2:
|
||||
self.in_table = True
|
||||
print("in real table")
|
||||
|
||||
elif self.in_table and tag == "tr":
|
||||
if self.in_table and tag == "tr":
|
||||
self.in_row = True
|
||||
print("in real table")
|
||||
self.col_index = 0
|
||||
|
||||
elif self.in_row and tag == "td":
|
||||
print("td data")
|
||||
self.col_index += 1
|
||||
|
||||
if self.col_index == 2:
|
||||
self.name = True
|
||||
elif self.col_index == 3:
|
||||
self.org = True
|
||||
|
||||
elif self.col_index == 1 and tag == "a":
|
||||
self.prefix = True
|
||||
elif self.col_index == 4 and tag == "a":
|
||||
self.website = True
|
||||
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
pass
|
||||
#print("Encountered an end tag :", tag)
|
||||
if tag == "th":
|
||||
self.th = False
|
||||
elif tag == "tr":
|
||||
self.in_row = False
|
||||
|
||||
|
||||
def handle_data(self, data):
|
||||
#print("Encountered some data :", data)
|
||||
pass
|
||||
if self.th and data == "Prefix":
|
||||
print("Found table start")
|
||||
self.in_table = True
|
||||
|
||||
if self.prefix:
|
||||
self.record = {}
|
||||
self.record['prefix'] = data
|
||||
|
||||
self.results.append(self.record)
|
||||
|
||||
self.prefix = False
|
||||
|
||||
elif self.name:
|
||||
self.record['name'] = data
|
||||
self.name = False
|
||||
elif self.org:
|
||||
self.record['organization'] = data
|
||||
self.org = False
|
||||
elif self.website:
|
||||
self.record['website'] = data
|
||||
self.website = False
|
||||
|
||||
def report(self):
|
||||
for record in self.results:
|
||||
print(record)
|
||||
|
||||
if __name__ == '__main__':
|
||||
url = "https://www.sixxs.net/tools/grh/ula/list/"
|
||||
|
|
@ -51,3 +86,5 @@ if __name__ == '__main__':
|
|||
html = "\n".join([ line.decode('utf-8') for line in response.readlines() ])
|
||||
|
||||
parser.feed(html)
|
||||
|
||||
parser.report()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue