# 6. Year detection (1900-2030) years = [n for n in numbers if 1900 <= n <= 2030] features['years_found'] = years
# 7. File extension hint if 'txt' in tokens: features['file_extension'] = 'txt' features['looks_like_filename'] = True else: features['looks_like_filename'] = False
It looks like you’re asking to build a from a raw string of mixed data:
# 8. Pairwise patterns (bigrams) bigrams = [' '.join(tokens[i:i+2]) for i in range(len(tokens)-1)] features['bigrams'] = bigrams
# 9. Embedded feature: "year + number" combo if len(years) == 1 and len(numbers) > 1: other_nums = [n for n in numbers if n not in years] if other_nums: features['year_num_pair'] = (years[0], other_nums[0])
features = {}