Last modified: Nov 16, 2023 By Alexander Williams
Python: Check Duplicate Lines in a File - Examples
Example 1: Basic Line Comparison
def has_duplicates(file_path):
    """Return True if the file at *file_path* contains at least one duplicate line.

    Improvement over the readlines()/set-length comparison: the file is
    streamed line by line, so memory stays bounded by the number of
    *distinct* lines, and the scan short-circuits on the first repeat.

    :param file_path: path to a text file
    :return: bool -- True when any line occurs more than once
    :raises OSError: if the file cannot be opened
    """
    seen = set()
    with open(file_path, 'r') as file:
        for line in file:
            # Lines keep their trailing newline, matching readlines() semantics.
            if line in seen:
                return True
            seen.add(line)
    return False
# Example usage: report whether example.txt holds any repeated lines.
file_path = 'example.txt'
message = (
    f"The file '{file_path}' contains duplicate lines."
    if has_duplicates(file_path)
    else f"The file '{file_path}' does not contain duplicate lines."
)
print(message)
Output:
# The file 'example.txt' contains duplicate lines.
Example 2: Using Counter from Collections
from collections import Counter
def has_duplicates_counter(file_path):
    """Report whether any line occurs more than once in *file_path*.

    Tallies every line's frequency with collections.Counter, then checks
    whether the largest tally exceeds one.

    :param file_path: path to a text file
    :return: bool -- True when a duplicate line exists
    """
    with open(file_path, 'r') as handle:
        tallies = Counter(handle.readlines())
    # default=0 keeps an empty file from raising and yields False.
    return max(tallies.values(), default=0) > 1
# Example usage: same report, driven by the Counter-based checker.
file_path = 'example.txt'
message = (
    f"The file '{file_path}' contains duplicate lines."
    if has_duplicates_counter(file_path)
    else f"The file '{file_path}' does not contain duplicate lines."
)
print(message)
Output:
# The file 'example.txt' contains duplicate lines.
Example 3: Using Set to Find Duplicates
def find_duplicates_set(file_path):
    """Return True if *file_path* contains any repeated line.

    The original relied on the cryptic ``line in seen or seen.add(line)``
    trick inside a set comprehension -- a side effect buried in an
    expression -- and materialized a full set of duplicates only to
    truth-test it.  An explicit loop is clearer and short-circuits on
    the first repeat.

    :param file_path: path to a text file
    :return: bool -- True when a duplicate line exists
    :raises OSError: if the file cannot be opened
    """
    seen = set()
    with open(file_path, 'r') as file:
        for line in file:
            if line in seen:
                return True
            seen.add(line)
    return False
# Example usage: same report, driven by the set-tracking checker.
file_path = 'example.txt'
message = (
    f"The file '{file_path}' contains duplicate lines."
    if find_duplicates_set(file_path)
    else f"The file '{file_path}' does not contain duplicate lines."
)
print(message)
Output:
# The file 'example.txt' contains duplicate lines.
Example 4: Detecting Duplicates by Sorting (equal lines become adjacent; note this is O(n log n), while the set-based approaches above are O(n))
def has_duplicates_sorted(file_path):
    """Detect duplicate lines in *file_path* by sorting.

    Sorting puts equal lines next to each other, so a single pass
    comparing each line with its successor is enough.

    :param file_path: path to a text file
    :return: bool -- True when a duplicate line exists
    """
    with open(file_path, 'r') as source:
        ordered = sorted(source.readlines())
    # zip pairs each line with the one after it; any equal pair means a dup.
    return any(left == right for left, right in zip(ordered, ordered[1:]))
# Example usage: same report, driven by the sort-based checker.
file_path = 'example.txt'
message = (
    f"The file '{file_path}' contains duplicate lines."
    if has_duplicates_sorted(file_path)
    else f"The file '{file_path}' does not contain duplicate lines."
)
print(message)
Output:
# The file 'example.txt' contains duplicate lines.
Example 5: Using Set and List Comprehension
def has_duplicates_set_comprehension(file_path):
    """Return True if *file_path* has at least one repeated line.

    The original depended on ``set.add`` returning None so that
    ``line in seen or seen.add(line)`` was truthy only for repeats -- a
    side effect hidden inside a generator fed to ``any()``.  The explicit
    loop below has identical behavior (including short-circuiting on the
    first repeat) and is readable at a glance.

    :param file_path: path to a text file
    :return: bool -- True when a duplicate line exists
    :raises OSError: if the file cannot be opened
    """
    seen = set()
    with open(file_path, 'r') as file:
        for line in file:
            if line in seen:
                return True
            seen.add(line)
    return False
# Example usage: same report, driven by the seen-set checker.
file_path = 'example.txt'
message = (
    f"The file '{file_path}' contains duplicate lines."
    if has_duplicates_set_comprehension(file_path)
    else f"The file '{file_path}' does not contain duplicate lines."
)
print(message)
Output:
# The file 'example.txt' contains duplicate lines.
Example 6: Using defaultdict for Line Counting
from collections import defaultdict
def has_duplicates_defaultdict(file_path):
    """Count lines with a defaultdict and stop at the first repeat.

    Every line increments its running tally; the moment any tally
    exceeds one the function returns True, otherwise False after the
    whole file has been counted.

    :param file_path: path to a text file
    :return: bool -- True when a duplicate line exists
    """
    tally = defaultdict(int)
    with open(file_path, 'r') as handle:
        for record in handle.readlines():
            tally[record] += 1
            if tally[record] > 1:
                return True
    return False
# Example usage: same report, driven by the defaultdict-based checker.
file_path = 'example.txt'
message = (
    f"The file '{file_path}' contains duplicate lines."
    if has_duplicates_defaultdict(file_path)
    else f"The file '{file_path}' does not contain duplicate lines."
)
print(message)
Output:
# The file 'example.txt' contains duplicate lines.