Last modified: Nov 16, 2023 by Alexander Williams

Python: Check Duplicate Lines in a File - Examples

Example 1: Basic Line Comparison


def has_duplicates(file_path):
    """Return True if the file at *file_path* contains any duplicate line.

    Uses ``read().splitlines()`` instead of ``readlines()`` so that line
    endings never affect the comparison: ``readlines()`` keeps the ``'\\n'``
    on every line except a final line without a trailing newline, which
    would hide a duplicate of that last line.
    """
    with open(file_path, 'r') as file:
        lines = file.read().splitlines()
    # A set drops repeats, so a size mismatch means at least one duplicate.
    return len(lines) != len(set(lines))

# Example usage
file_path = 'example.txt'
verdict = "contains" if has_duplicates(file_path) else "does not contain"
print(f"The file '{file_path}' {verdict} duplicate lines.")

Output:


# The file 'example.txt' contains duplicate lines.

Example 2: Using Counter from Collections


from collections import Counter

def has_duplicates_counter(file_path):
    """Return True if any line in *file_path* occurs more than once."""
    with open(file_path, 'r') as file:
        tallies = Counter(file.readlines())
    # The highest tally exceeds 1 exactly when some line repeats;
    # default=0 keeps an empty file reporting False.
    return max(tallies.values(), default=0) > 1

# Example usage
file_path = 'example.txt'
verdict = "contains" if has_duplicates_counter(file_path) else "does not contain"
print(f"The file '{file_path}' {verdict} duplicate lines.")

Output:


# The file 'example.txt' contains duplicate lines.

Example 3: Using Set to Find Duplicates


def find_duplicates_set(file_path):
    """Return True if the file at *file_path* has at least one repeated line."""
    with open(file_path, 'r') as file:
        lines = file.readlines()
    seen = set()
    repeats = set()
    # Collect every line already seen once; anything landing in
    # `repeats` proves a duplicate exists.
    for line in lines:
        if line in seen:
            repeats.add(line)
        else:
            seen.add(line)
    return bool(repeats)

# Example usage
file_path = 'example.txt'
verdict = "contains" if find_duplicates_set(file_path) else "does not contain"
print(f"The file '{file_path}' {verdict} duplicate lines.")

Output:


# The file 'example.txt' contains duplicate lines.

Example 4: Sorting Lines to Group Duplicates Together


def has_duplicates_sorted(file_path):
    """Return True if *file_path* contains duplicate lines.

    Sorting places equal lines next to each other, so a single pass
    over adjacent pairs is enough to detect a repeat.
    """
    with open(file_path, 'r') as file:
        ordered = sorted(file.readlines())
    return any(left == right for left, right in zip(ordered, ordered[1:]))

# Example usage
file_path = 'example.txt'
verdict = "contains" if has_duplicates_sorted(file_path) else "does not contain"
print(f"The file '{file_path}' {verdict} duplicate lines.")

Output:


# The file 'example.txt' contains duplicate lines.

Example 5: Using Set and List Comprehension


def has_duplicates_set_comprehension(file_path):
    """Return True as soon as any line in *file_path* appears a second time."""
    with open(file_path, 'r') as file:
        lines = file.readlines()
    seen = set()
    # Explicit loop instead of the `line in seen or seen.add(line)`
    # side-effect trick: same short-circuit behavior, easier to read.
    for line in lines:
        if line in seen:
            return True
        seen.add(line)
    return False

# Example usage
file_path = 'example.txt'
verdict = "contains" if has_duplicates_set_comprehension(file_path) else "does not contain"
print(f"The file '{file_path}' {verdict} duplicate lines.")

Output:


# The file 'example.txt' contains duplicate lines.

Example 6: Using defaultdict for Line Counting


from collections import defaultdict

def has_duplicates_defaultdict(file_path):
    """Return True on the first line of *file_path* that appears twice."""
    tallies = defaultdict(int)
    with open(file_path, 'r') as file:
        for line in file.readlines():
            tallies[line] += 1
            # Short-circuit: no need to finish counting once a repeat shows up.
            if tallies[line] > 1:
                return True
    return False

# Example usage
file_path = 'example.txt'
verdict = "contains" if has_duplicates_defaultdict(file_path) else "does not contain"
print(f"The file '{file_path}' {verdict} duplicate lines.")

Output:


# The file 'example.txt' contains duplicate lines.