Extract hashtags from text with Python
This is a snippet I was using to extract hashtags from text. So for instance, if I was typing up a note, I could type #tags into my content and then find them all without storing them in a separate field or data structure somewhere.
Code
import re
from typing import Iterator
import markdown
# Compiled once at import time; used to locate hashtag candidates in free text.
#
# Alternative pattern, currently disabled:
# HASHTAG_PATTERN = re.compile(r"(?<!\S)#([a-z0-9][-_a-z0-9]*[a-z0-9])(?!\])")
#
# How the active pattern reads:
#   (?:^|\s)                the "#" must be at the very start of the string or
#                           directly after a whitespace character, so
#                           "email#tag" and "(#anchor)" are ignored
#   #                       the literal hash sign (not part of the capture)
#   ([a-z0-9_][a-z0-9_-]*)  group 1, the tag itself: first character is a
#                           lowercase ASCII letter, a digit or "_"; the
#                           following characters may additionally be "-"
#   \b                      the match has to end on a word boundary, which
#                           drops trailing hyphens and punctuation
#
# No flags are passed, so uppercase letters are not matched.
# Underscores are accepted here on purpose: iter_tags() discards any captured
# tag containing "_", which rejects the whole tag instead of matching a prefix.
HASHTAG_PATTERN = re.compile(r"(?:^|\s)#([a-z0-9_][a-z0-9_-]*)\b")
def string_is_integer(text_string: str) -> bool:
    """Return True if ``text_string`` is a base-10 integer literal.

    The accepted forms are those understood by ``int()``: optional
    surrounding whitespace, an optional ``+``/``-`` sign, and digits that
    may be separated by single underscores (e.g. ``"42"``, ``" -7 "``,
    ``"1_000"``).

    Args:
        text_string: The text to inspect.

    Returns:
        True when the text denotes an integer, False otherwise.
    """
    try:
        int(text_string)
    except ValueError:
        # Recent Python versions refuse to convert digit strings longer than
        # sys.get_int_max_str_digits() and report that as ValueError too.
        # Re-check the shape of the text so that an arbitrarily long run of
        # digits is still recognised as an integer instead of being mistaken
        # for ordinary text.
        return isinstance(text_string, str) and (
            re.fullmatch(r"\s*[+-]?\d+(?:_\d+)*\s*", text_string) is not None
        )
    return True
def iter_tags(text: str) -> Iterator[str]:
    """Lazily yield the hashtags found in ``text``, without the leading "#".

    Candidates are located with ``HASHTAG_PATTERN`` and yielded in order of
    appearance. Two kinds of candidate are dropped:

    * tags that ``string_is_integer`` accepts (e.g. ``#123``), since those
      are usually issue or pull-request numbers rather than topics;
    * tags containing an underscore (e.g. ``#beach_party``, ``#_fun``).
    """
    candidates = (found.group(1) for found in HASHTAG_PATTERN.finditer(text))
    for candidate in candidates:
        is_numeric = string_is_integer(candidate)
        # Keep only tags that are neither numeric nor underscore-bearing.
        if not is_numeric and "_" not in candidate:
            yield candidate
def extract_tags(text: str) -> list[str]:
    """Return every hashtag in ``text`` as a list.

    This is the eager counterpart of ``iter_tags``: the tags come back in
    the order that ``iter_tags`` yields them, without the leading "#".

    Args:
        text: The text to scan for hashtags.

    Returns:
        A new list of tag strings; empty when no tag is found.
    """
    # list() materialises the generator directly; a pass-through
    # comprehension adds nothing here.
    return list(iter_tags(text))
Tests
@pytest.mark.parametrize(
    "input_text, expected_tags",
    [
        # Case 1: a single tag followed by a comma
        ("Hello #world, this is a test.", ["world"]),
        # Case 2: several tags in one sentence
        ("I love #python and #coding.", ["python", "coding"]),
        # Case 3: a purely numeric tag is dropped
        ("This is pull request #12345.", []),
        # Case 4: a valid tag next to a numeric one
        ("Let's talk about #ai-ethics but not issue #99.", ["ai-ethics"]),
        # Case 5: tag at the very start of the text
        ("#start of a sentence.", ["start"]),
        # Case 6: text without any hashtag
        ("This is a string with no hashtags.", []),
        # Case 7: empty input
        ("", []),
        # Case 8: hyphens and digits inside a tag
        ("Check out #version-2-release.", ["version-2-release"]),
        # Case 9: trailing punctuation is not part of the tag
        ("A great resource is #stackoverflow.", ["stackoverflow"]),
        # Case 10: "#" glued to a preceding word is not a tag
        ("An email#tag should not match.", []),
        # Case 11: "#" preceded by "(" (markdown link target) is not a tag
        ("A link like this [link](#test)] should not match.", []),
        # Case 12: several numeric tags, all dropped
        ("We have #100 and #200, both are just numbers.", []),
        # Case 13: tags containing underscores are dropped
        ("This calls for a #beach_party and #_fun!", []),
        # Case 14: a tag has to start with a letter or a digit
        ("This #-tag and #_test and #this or #1nine", ["this", "1nine"]),
    ],
)
def test_iter_tags(input_text: str, expected_tags: list[str]):
    """
    Checks that iter_tags yields exactly the expected hashtags, in order,
    for each parametrized input.
    Args:
        input_text: Text handed to iter_tags.
        expected_tags: Tags that iter_tags is expected to yield.
    """
    # Drain the generator and compare against the expectation in one step.
    assert list(iter_tags(input_text)) == expected_tags