Extract hashtags from text with Python

Aug. 27, 2025 Python Script

This is a snippet I use to extract hashtags from text. For instance, when typing up a note, I can write #tags directly in the content and then find them all later, without storing them in a separate field or data structure.

Code

import re
from typing import Iterator

import markdown

# Pattern for a candidate hashtag; the tag text (without "#") is capture group 1.
#   (?:^|\s)   the "#" must sit at the start of the string or right after whitespace
#   [a-z0-9_]  first tag character: lowercase letter, digit or underscore
#   [a-z0-9_-]*  remaining characters may additionally be hyphens
#   \b         the tag must end on a word boundary
# Only lowercase letters are in the character classes and no flags are passed,
# so uppercase letters are not accepted as tag characters.
# Underscores are accepted here; iter_tags then discards any tag containing one.
# Alternative pattern, left commented out:
# HASHTAG_PATTERN = re.compile(r"(?<!\S)#([a-z0-9][-_a-z0-9]*[a-z0-9])(?!\])")
HASHTAG_PATTERN = re.compile(r"(?:^|\s)#([a-z0-9_][a-z0-9_-]*)\b")


def string_is_integer(text_string: str) -> bool:
    """Report whether ``text_string`` can be parsed by ``int()``.

    Returns True when ``int(text_string)`` succeeds and False when it raises
    ``ValueError``. Any other exception raised by ``int()`` is not caught and
    propagates to the caller.
    """
    try:
        int(text_string)
    except ValueError:
        return False
    else:
        return True


def iter_tags(text: str) -> Iterator[str]:
    """Lazily yield the hashtags found in ``text``, without the leading ``#``.

    Candidates are the group-1 captures of ``HASHTAG_PATTERN``. A candidate is
    dropped when it contains an underscore or when ``string_is_integer``
    reports it as an integer (e.g. ``#123``). Everything else is yielded in
    order of appearance; repeated tags are yielded each time they occur.
    """
    candidates = (found.group(1) for found in HASHTAG_PATTERN.finditer(text))
    for candidate in candidates:
        # Underscored and integer-like candidates are not treated as tags.
        if "_" in candidate or string_is_integer(candidate):
            continue
        yield candidate


def extract_tags(text: str) -> list[str]:
    """Collect every hashtag that ``iter_tags`` yields for ``text`` into a list."""
    return list(iter_tags(text))

Tests

@pytest.mark.parametrize(
    "input_text, expected_tags",
    [
        # 1: a single tag followed by punctuation
        ("Hello #world, this is a test.", ["world"]),
        # 2: several tags are returned in order of appearance
        ("I love #python and #coding.", ["python", "coding"]),
        # 3: a purely numeric tag is dropped
        ("This is pull request #12345.", []),
        # 4: valid and numeric tags mixed; only the valid one survives
        ("Let's talk about #ai-ethics but not issue #99.", ["ai-ethics"]),
        # 5: tag at the very start of the text
        ("#start of a sentence.", ["start"]),
        # 6: text without any "#"
        ("This is a string with no hashtags.", []),
        # 7: empty input
        ("", []),
        # 8: hyphens and digits inside a tag are kept
        ("Check out #version-2-release.", ["version-2-release"]),
        # 9: trailing punctuation is not part of the tag
        ("A great resource is #stackoverflow.", ["stackoverflow"]),
        # 10: a "#" glued to the preceding word is not a tag
        ("An email#tag should not match.", []),
        # 11: a "#" directly after "(" (markdown anchor link) is not a tag
        ("A link like this [link](#test)] should not match.", []),
        # 12: several numeric tags, all dropped
        ("We have #100 and #200, both are just numbers.", []),
        # 13: tags containing an underscore are dropped
        ("This calls for a #beach_party and #_fun!", []),
        # 14: a leading "-" or "_" disqualifies a tag; a leading digit is fine
        ("This #-tag and #_test and #this or #1nine", ["this", "1nine"]),
    ],
)
def test_iter_tags(input_text: str, expected_tags: list[str]) -> None:
    """Check that iter_tags yields exactly the expected tags for each case.

    Args:
        input_text: The text handed to iter_tags.
        expected_tags: The tags iter_tags should yield, in order.
    """
    # The generator is materialised so it can be compared with the expected list.
    assert list(iter_tags(input_text)) == expected_tags