Add findRedundantDomain.py script

pull/52/head
loyalsoldier 2020-07-31 12:56:49 +08:00
commit aaf73868db
No known key found for this signature in database
GPG Key ID: 23829BBC1ACF2C90
1 changed files with 70 additions and 0 deletions

70
findRedundantDomain.py Normal file
View File

@ -0,0 +1,70 @@
#!/usr/bin/env python3
import sys
# Echo the two command-line arguments (input list path, output file path).
# NOTE(review): this executes at module load time, i.e. also when the file is
# imported, and raises IndexError when fewer than two arguments are supplied.
# Consider moving it under the `__main__` guard.
print(sys.argv[1], sys.argv[2])
# Description of the script. Because it follows executable statements, Python
# evaluates it as a plain string expression, not as the module docstring.
''' Find redundant items in domain lists.
e.g. 'bar.foo.com' is redundant for 'foo.com'.
'''
def load(list):
    ''' Parse conf file & Prepare data structure.

    Blank lines and lines starting with '#' are skipped. Every remaining
    line is stripped, lower-cased (domain names are case-insensitive) and
    split on '.' into its labels.

    Args:
        list: Path of the domain list file, one domain per line. The name
            shadows the builtin `list`; it is kept unchanged so existing
            callers (including keyword callers) continue to work.

    Returns:
        A list of label lists, ordered by number of labels (fewest first;
        file order is kept among equals), e.g.
            [ ['abc', 'com'],
              ['bar', 'foo', 'com'],
              ... ]

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    '''
    results = []
    # Decode explicitly as UTF-8: the platform default encoding is locale
    # dependent and chokes on non-ASCII comments under e.g. cp1252 or "C".
    with open(list, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it via readlines().
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # A domain name is case-insensitive and
            # consists of several labels, separated by a full stop
            results.append(line.lower().split('.'))
    # Shorter domains first, so that a parent domain is always seen before
    # any of its subdomains. list.sort() is stable.
    results.sort(key=len)
    return results
def find(labelses, removedDomainFile):
    ''' Find redundant items by a tree of top-level domain label to sub-level.

    `tree` is like { 'com': { 'foo': { 'bar': LEAF },
                              'abc': LEAF },
                     'org': ... }

    A domain is redundant when, walking its labels from the top-level label
    downwards, a LEAF is reached: that LEAF marks an already registered
    domain which is either identical to, or a parent of, the current one.

    Args:
        labelses: Iterable of label lists such as ['bar', 'foo', 'com'].
            The lists are not modified. They are processed in order of
            increasing label count, so the input need not be pre-sorted.
        removedDomainFile: Path of the file that redundant domains are
            appended to, one per line. The file is only opened when at
            least one redundant domain was found.

    Returns:
        The redundant domains as dotted strings, in the order found.

    Raises:
        OSError: If `removedDomainFile` cannot be opened for appending.
    '''
    # Unique sentinel marking "a registered domain ends here".
    LEAF = object()
    tree = {}
    redundant = []
    # Parents must be inserted before their subdomains for the LEAF check to
    # work; sorted() is stable, so already-sorted input keeps its order.
    for labels in sorted(labelses, key=len):
        domain = '.'.join(labels)
        # Init root node as current node
        node = tree
        # Walk from the top-level label to the leftmost one by index, so the
        # caller's list is left intact (no pop()).
        for depth in range(len(labels) - 1, -1, -1):
            label = labels[depth]
            child = node.get(label)
            if child is LEAF:
                # Current domain is an existing domain or a subdomain of one.
                # labels[:depth] are the labels not yet consumed.
                print(f"Redundant found: {domain} at {'.'.join(labels[:depth])}")
                redundant.append(domain)
                break
            if child is None:
                # Create a leaf node if current label is the last one,
                # otherwise a branch node.
                child = LEAF if depth == 0 else {}
                node[label] = child
            # Iterate to child node
            node = child
    if redundant:
        # Append all hits with a single open instead of one open per hit.
        with open(removedDomainFile, "a", encoding="utf-8") as f:
            f.writelines(f"{name}\n" for name in redundant)
    return redundant
if __name__ == '__main__':
    # Usage: findRedundantDomain.py <domain-list-file> <redundant-output-file>
    parsed_domains = load(sys.argv[1])
    find(parsed_domains, sys.argv[2])