7.17. Regex Quantifier Recap

  • A quantifier specifies how many occurrences of the preceding qualifier or character class should be matched

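  • Lazy - a quantifier followed by ? matches as few characters as possible; without it, the quantifier is greedy and matches as many characters as possible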

7.17.1. SetUp

>>> import re

7.17.2. Numbers

  • r'\d+' (equivalent to r'\d{1,}') - Greedy

  • r'\d+?' (equivalent to r'\d{1,}?') - Lazy

>>> string = 'On Sun, Jan 1st, 2000 at 12:00 AM Alice <alice@example.com> wrote'
>>> re.findall(r'\d{1,}', string)
['1', '2000', '12', '00']
>>> string = 'On Sun, Jan 1st, 2000 at 12:00 AM Alice <alice@example.com> wrote'
>>> re.findall(r'\d{1,}?', string)
['1', '2', '0', '0', '0', '1', '2', '0', '0']

7.17.3. Strings

  • r'[A-Z].+\.' - Greedy

  • r'[A-Z].+?\.' - Lazy

>>> string = 'Email from Alice. Received on Sunday.'
>>> sentence = r'[A-Z].+\.'
>>> re.findall(sentence, string)
['Email from Alice. Received on Sunday.']
>>> string = 'Email from Alice. Received on Sunday.'
>>> sentence = r'[A-Z].+?\.'
>>> re.findall(sentence, string)
['Email from Alice.', 'Received on Sunday.']

Note the number of sentences in each case. Without the lazy quantifier, the pattern returns only one result: from the first capital letter to the last possible dot. The lazy quantifier splits the string into two parts, each running from a capital letter to the closest dot.

7.17.4. Greedy vs. Lazy

  • r'\d+' - Greedy

  • r'\d+?' - Lazy

>>> string = 'On Sun, Jan 1st, 2000 at 12:00 AM Alice <alice@example.com> wrote'
>>> re.findall(r'\d+', string)
['1', '2000', '12', '00']
>>> string = 'On Sun, Jan 1st, 2000 at 12:00 AM Alice <alice@example.com> wrote'
>>> re.findall(r'\d+?', string)
['1', '2', '0', '0', '0', '1', '2', '0', '0']
>>> string = 'Email from Alice. Received on Sunday.'
>>> sentence = r'[A-Z].+\.'
>>> re.findall(sentence, string)
['Email from Alice. Received on Sunday.']
>>> string = 'Email from Alice. Received on Sunday.'
>>> sentence = r'[A-Z].+?\.'
>>> re.findall(sentence, string)
['Email from Alice.', 'Received on Sunday.']

Note the number of sentences in each case. Without the lazy quantifier, the pattern returns only one result: from the first capital letter to the last possible dot. The lazy quantifier splits the string into two parts, each running from a capital letter to the closest dot.

7.17.5. Case Study 1

import re

html = '<p>Hello World</p>'

re.findall(r'<p>', html)
# ['<p>']

re.findall(r'</p>', html)
# ['</p>']

re.findall(r'</{0,1}p>', html)
# ['<p>', '</p>']

re.findall(r'</?p>', html)
# ['<p>', '</p>']

7.17.6. Case Study 2

import re

CODE = """

name = "Alice"
print(name)

"""

# %%

variable = re.findall(r'^(\w{1,}) =', CODE, flags=re.MULTILINE)
print(variable)

# %%

variable = re.findall(r'^(\w+) =', CODE, flags=re.MULTILINE)
print(variable)

import re

CODE = """

name ="Alice"
print(name)

"""

# %%

variable = re.findall(r'^(\w{1,})\s{0,1}=\s{0,1}', CODE, flags=re.MULTILINE)
print(variable)

# %%

variable = re.findall(r'^(\w+)\s?=\s?', CODE, flags=re.MULTILINE)
print(variable)

import re

CODE = """

name ="Alice"
print(name)

"""

# %%

variable = re.findall(r'^(\w{1,})\s{0,1}=\s{0,1}".{0,}"', CODE, flags=re.MULTILINE)
print(variable)

# %%

variable = re.findall(r'^(\w+)\s?=\s?".*"', CODE, flags=re.MULTILINE)
print(variable)


# name = ""
# name = "a"
# name = "abc"
import re

CODE = """

name =      "Alice"
print(name)

"""

# %%

variable = re.findall(r'^(\w{1,})\s{0,}=\s{0,}".{0,}"', CODE, flags=re.MULTILINE)
print(variable)

# %%

variable = re.findall(r'^(\w+)\s*=\s*".*"', CODE, flags=re.MULTILINE)
print(variable)


# name = ""
# name = "a"
# name = "abc"

7.17.7. Case Study 3

import re

html = '<p>Hello World</p>'


# %%

re.findall(r'<.+>', html)
# ['<p>Hello World</p>']

# %%

re.findall(r'<.+?>', html)
# ['<p>', '</p>']

7.17.8. Case Study 4

import re
from pprint import pprint

string = 'Litwo. Ojczyzno moja. Ty jesteś jak zdrowie. Ile cię trzeba cenić, ten tylko się dowie, kto cię stracił. [bla, bla, bla] I ja tam z gośćmi byłem, miód i wino piłem. A com wiedział i słyszał, w księgi umieściłem.'


# %% Greedy

zdania = re.findall(r'[A-Z].+\.', string)  # up to the farthest dot

len(zdania)
# 1

print(zdania)
# ['Litwo. Ojczyzno moja. Ty jesteś jak zdrowie. Ile cię trzeba cenić, ten tylko się dowie, kto cię stracił. [bla, bla, bla] I ja tam z gośćmi byłem, miód i wino piłem. A com wiedział i słyszał, w księgi umieściłem.']


# %% Lazy

zdania = re.findall(r'[A-Z].+?\.', string)  # up to the closest dot

len(zdania)
# 6

pprint(zdania)
# ['Litwo.',
#  'Ojczyzno moja.',
#  'Ty jesteś jak zdrowie.',
#  'Ile cię trzeba cenić, ten tylko się dowie, kto cię stracił.',
#  'I ja tam z gośćmi byłem, miód i wino piłem.',
#  'A com wiedział i słyszał, w księgi umieściłem.']

7.17.9. Use Case - 1

>>> HTML = '<p>We choose to go to the Moon</p>'
>>>
>>> tag = r'<.+>'
>>> re.findall(tag, HTML)
['<p>We choose to go to the Moon</p>']
>>>
>>> tag = r'<.+?>'
>>> re.findall(tag, HTML)
['<p>', '</p>']

7.17.10. Use Case - 2

>>> import re
>>> HTML = '<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>'
>>> re.findall(r'<p>.*</p>', HTML)
['<p>Paragraph 1</p><p>Paragraph 2</p>']
>>> re.findall(r'<p>.*?</p>', HTML)
['<p>Paragraph 1</p>', '<p>Paragraph 2</p>']

7.17.11. Use Case - 3

>>> import re
>>> HTML = '<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>'
>>> re.findall(r'</?.*>', HTML)
['<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>']
>>> re.findall(r'</?.*?>', HTML)
['<h1>', '</h1>', '<p>', '</p>', '<p>', '</p>']

7.17.12. Use Case - 4

>>> import re
>>> HTML = '<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>'
>>> re.findall(r'<.+>', HTML)
['<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>']
>>> re.findall(r'<.+?>', HTML)
['<h1>', '</h1>', '<p>', '</p>', '<p>', '</p>']
>>> re.findall(r'</?.+?>', HTML)
['<h1>', '</h1>', '<p>', '</p>', '<p>', '</p>']
>>> re.findall(r'</?(.+?)>', HTML)
['h1', 'h1', 'p', 'p', 'p', 'p']
>>> tags = re.findall(r'</?(.+?)>', HTML)
>>> sorted(set(tags))
['h1', 'p']

7.17.13. Use Case - 1

  • Float

>>> string = 'Pi number is 3.1415...'
>>> pi = re.findall(r'\d+\.\d+', string)
>>>
>>> pi
['3.1415']

7.17.14. Use Case - 2

>>> string = 'On Sun, Jan 1st, 2000 at 12:00 AM Alice <alice@example.com> wrote'
>>> result = re.findall(r'\d\d:\d\d', string)
>>>
>>> result
['12:00']

7.17.15. Use Case - 3

>>> string = 'On Sun, Jan 1st, 2000 at 12:00 AM Alice <alice@example.com> wrote'
>>> result = re.findall(r'\w{3} \d{1,2}st, \d{4}', string)
>>>
>>> result
['Jan 1st, 2000']

7.17.16. Use Case - 4

>>> line = 'value=123'
>>>
>>> re.findall(r'(\w+)\s?=\s?(\d+)', line)
[('value', '123')]
>>> line = 'value = 123'
>>>
>>> re.findall(r'(\w+)\s?=\s?(\d+)', line)
[('value', '123')]

7.17.17. Use Case - 5

>>> HTML = '<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>'
>>> re.findall(r'<p>', HTML)
['<p>', '<p>']
>>> re.findall(r'</p>', HTML)
['</p>', '</p>']
>>> re.findall(r'</?p>', HTML)
['<p>', '</p>', '<p>', '</p>']

7.17.18. Use Case - 1

>>> HTML = '<p>We choose to go to the Moon</p>'
>>>
>>> tag = r'<.+>'
>>> re.findall(tag, HTML)
['<p>We choose to go to the Moon</p>']
>>>
>>> tag = r'<.+?>'
>>> re.findall(tag, HTML)
['<p>', '</p>']

7.17.19. Use Case - 2

>>> import re
>>> HTML = '<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>'
>>> re.findall(r'<p>.*</p>', HTML)
['<p>Paragraph 1</p><p>Paragraph 2</p>']
>>> re.findall(r'<p>.*?</p>', HTML)
['<p>Paragraph 1</p>', '<p>Paragraph 2</p>']

7.17.20. Use Case - 3

>>> import re
>>> HTML = '<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>'
>>> re.findall(r'</?.*>', HTML)
['<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>']
>>> re.findall(r'</?.*?>', HTML)
['<h1>', '</h1>', '<p>', '</p>', '<p>', '</p>']

7.17.21. Use Case - 4

>>> import re
>>> HTML = '<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>'
>>> re.findall(r'<.+>', HTML)
['<h1>Header 1</h1><p>Paragraph 1</p><p>Paragraph 2</p>']
>>> re.findall(r'<.+?>', HTML)
['<h1>', '</h1>', '<p>', '</p>', '<p>', '</p>']
>>> re.findall(r'</?.+?>', HTML)
['<h1>', '</h1>', '<p>', '</p>', '<p>', '</p>']
>>> re.findall(r'</?(.+?)>', HTML)
['h1', 'h1', 'p', 'p', 'p', 'p']
>>> tags = re.findall(r'</?(.+?)>', HTML)
>>> sorted(set(tags))
['h1', 'p']

7.17.22. Assignments