# Python
import os
# Raw string literal: with the r prefix every backslash is kept as a literal character.
wd_t = r'C:\Users\mypath\to\myfolder'
# Evaluating the bare name displays the repr, in which each backslash is shown doubled.
wd_t
## 'C:\\Users\\mypath\\to\\myfolder'
# print() displays the actual content, with single backslashes.
print(wd_t)
## C:\Users\mypath\to\myfolder
The purpose of the r’…’ notation is to create raw strings and use different rules for interpreting backslash escape sequences. There is no need to use r if you’re just getting the value from another variable.
In Python, a string can be treated as an array, and it’s easy to select specific pieces of the string using the position of the letters. In R, a string is a block that needs to be decomposed in order to select specific letters. The number of characters in a string is its length. Strings can be modified in various ways, which are listed in the following link: https://www.w3schools.com/python/python_ref_string.asp . Some examples are provided there.
# Python
# Label text followed by the current working directory.
wd_info = 'Current working directory:\n'+os.getcwd()
# Slicing: characters at index 10 up to (but not including) index 20.
slc_infos = wd_info[10:20]
len(slc_infos)
## 10
slc_infos
## 'rking dire'
# A str cannot be changed in place; converting it to a list gives a
# mutable sequence of single characters.
slc_infos = list(slc_infos)
slc_infos
## ['r', 'k', 'i', 'n', 'g', ' ', 'd', 'i', 'r', 'e']
# mutable
slc_infos[7] = '0'
slc_infos
## ['r', 'k', 'i', 'n', 'g', ' ', 'd', '0', 'r', 'e']
# count
wd_info.count('s')
## 1
# modify the string
wd_info.title()
## 'Current Working Directory:\n/Home/Peltouz/Documents/Github/M1-Py-Ds2E-Se'
wd_info.upper()
## 'CURRENT WORKING DIRECTORY:\n/HOME/PELTOUZ/DOCUMENTS/GITHUB/M1-PY-DS2E-SE'
wd_info.lower()
## 'current working directory:\n/home/peltouz/documents/github/m1-py-ds2e-se'
# NOTE: in the run shown, ': ' (colon followed by a space) never occurs in
# wd_info -- the colon is followed by '\n' -- so split() returns a
# one-element list and the join below gives back the same text.
splt_inf = wd_info.split(': ')
splt_inf
## ['Current working directory:\n/home/peltouz/Documents/GitHub/M1-Py-DS2E-SE']
' '.join(splt_inf)
## 'Current working directory:\n/home/peltouz/Documents/GitHub/M1-Py-DS2E-SE'
import numpy as np
# join() concatenates the items of a list, inserting the string it is
# called on between consecutive items.
"".join(['a :','1','b :','2','c :','3'])
## 'a :1b :2c :3'
" ".join(['a :','1','b :','2','c :','3'])
## 'a : 1 b : 2 c : 3'
# Pair each label with its number, then join every pair separately.
[''.join([letter,number]) for letter, number in zip(['a :','b :','c :'],['1','2','3'])]
## ['a :1', 'b :2', 'c :3']
# Same result, with map() applying ''.join to each zipped pair.
list(map(''.join,zip(['a :','b :','c :'],['1','2','3'])))
## ['a :1', 'b :2', 'c :3']
# numpy character arrays: + concatenates element by element.
'.'.join((np.char.array(['a :','b :','c :'])+np.char.array(['1','2','3'])).tolist())
## 'a :1.b :2.c :3'
# Adding a plain string appends it to every element of the array.
np.char.array([['a :','b :','c :']])+'O'
## chararray([['a :O', 'b :O', 'c :O']], dtype='<U4')
# Python
import time
import psutil
import os
# Human-readable timestamp for the current time.
time_ = time.ctime(time.time())
time_info = 'Time: '
# Current working directory of the running Python process.
wd = os.getcwd()
# The leading '\n' puts this label on a second line.
wd_info = '\nCurrent working directory: '
# Assemble the two-line report by plain string concatenation.
infos = time_info+time_+wd_info+wd
print(infos)
## Time: Mon Mar 24 12:37:46 2025
## Current working directory: /home/peltouz/Documents/GitHub/M1-Py-DS2E-SE
Here is some information on the computer where this document was created:
The working directory is the path to the folder where we are currently working. If we read or write a file without specifying the exact path (only the name of the file), the program will search for the file in the current working directory.
‘\’ is the backslash character and is used to specify special characters, tabulation, new lines, backslashes or quotes, etc. When working with file paths, this can be confusing for the machine. To avoid this, we can specify that the string must be read as it is by using ‘r’ before the quotation mark.
Here is a list of escape characters that can be useful:
Escape Sequence | Description |
---|---|
\t | Insert a tab in the text at this point. |
\b | Insert a backspace in the text at this point. |
\n | Insert a newline in the text at this point. |
\r | Insert a carriage return in the text at this point. |
\f | Insert a form feed in the text at this point. |
\' | Insert a single quote character in the text at this point. |
\" | Insert a double quote character in the text at this point. |
\\ | Insert a backslash character in the text at this point. |
# Python
def get_infos():
    """Print the current time and working directory, built with ``+``.

    Prints two lines, ``Time: <ctime>`` and
    ``Current working directory: <cwd>``, and returns None.
    """
    time_info = 'Time: '+time.ctime(time.time())
    # The leading '\n' starts the second line of the report.
    wd_info = '\nCurrent working directory: '+os.getcwd()
    infos = time_info+wd_info
    print(infos)
def get_infos_2():
    """Print the current time and working directory, built with ``%`` formatting.

    Prints two lines, ``Time: <ctime>`` and
    ``Current working directory: <cwd>``, and returns None.
    """
    # The second line of the template starts at column 0 on purpose: any
    # indentation inside the triple-quoted string would be printed.
    infos = '''Time: %s
Current working directory: %s'''% (time.ctime(time.time()),
                                   os.getcwd())
    print(infos)
def get_infos_3():
    """Print the current time and working directory, built with ``str.format``.

    Prints two lines, ``Time: <ctime>`` and
    ``Current working directory: <cwd>``, and returns None.
    """
    # The second line of the template starts at column 0 on purpose: any
    # indentation inside the triple-quoted string would be printed.
    infos = '''Time: {t}
Current working directory: {wd}'''.format(t = time.ctime(time.time()),
                                          wd = os.getcwd())
    print(infos)
# The three variants print the same two-line report.
get_infos()
## Time: Mon Mar 24 12:37:46 2025
## Current working directory: /home/peltouz/Documents/GitHub/M1-Py-DS2E-SE
get_infos_2()
## Time: Mon Mar 24 12:37:46 2025
## Current working directory: /home/peltouz/Documents/GitHub/M1-Py-DS2E-SE
get_infos_3()
## Time: Mon Mar 24 12:37:46 2025
## Current working directory: /home/peltouz/Documents/GitHub/M1-Py-DS2E-SE
'<p>We trained a large, deep convolutional neural network to classify the 1.3 million
high-resolution images in the LSVRC-2010 ImageNet training set into the 1000 different
classes. On the test data, we achieved top-1 and top-5 error rates of 39.7\% and
18.9\% which is considerably better than the previous state-of-the-art results.
The neural network, which has 60 million parameters and 500,000 neurons,
consists of five convolutional layers, some of which are followed by max-pooling
layers, and two globally connected layers with a final 1000-way softmax.
To make training faster, we used non-saturating neurons and a very efficient GPU
implementation of convolutional nets. To reduce overfitting in the globally
connected layers we employed a new regularization method that proved to be very effective.</p>'
Regular expressions (regex) are a way to express how a character sequence should be matched, and are a very flexible tool. They are commonly used in text manipulation and are used by string-searching algorithms for “find” or “find and replace” operations on strings, or for input validation.
Except for the metacharacters (+ ? . * ^ $ ( ) [ ] { } | \), all characters match themselves. You can escape a metacharacter by preceding it with a backslash.
Some functions in ‘re’ (Python) and ‘stringr’(R) that can be useful when working with regular expressions are:
Re | Stringr | Description |
---|---|---|
findall | str_match_all | Returns a list containing all matches |
finditer | | Returns an iterator containing all matches |
search | str_locate_all | Returns a Match object if there is a match anywhere in the string |
split | str_split | Returns a list where the string has been split at each match |
sub | str_replace_all | Replaces one or many matches with a string |
Metacharacters are characters that have a special meaning in regular expressions. Their uses are discussed in more detail in the next few sections.
Parentheses ( ) are used for grouping. Square brackets [ ] define a character class, and curly braces { } are used by a quantifier with specific limits.
Character | Description | Example |
---|---|---|
[] | A set of characters | “[a-m]” |
\ | Signals a special sequence (can also be used to escape special characters) | “\d” |
. | Any character (except newline character) | “he..o” |
^ | Starts with | “^hello” |
$ | Ends with | “world$” |
* | Zero or more occurrences | “aix*” |
+ | One or more occurrences | “aix+” |
{} | Exactly the specified number of occurrences | “al{2}” |
| | Either or | “x|y” |
() | Grouping characters (or other regexes) | “(ab)\1” matches “abab” |
# Python
import re
# Read the whole abstract into one string; the with-block closes the file
# as soon as it has been read instead of leaving the handle open.
with open("data/imagenet_abstract.txt", "r") as abstract_file:
    abstract = abstract_file.read()
# search() returns a Match object for the first occurrence only.
re.search('layers',abstract)
## <_sre.SRE_Match object; span=(437, 443), match='layers'>
# findall() returns every non-overlapping match as a list of strings.
re.findall('layers',abstract)
## ['layers', 'layers', 'layers', 'layers']
# 't+' needs at least one 't', so 'ne' is not matched.
re.findall('net+','net ne nett networks')
## ['net', 'nett', 'net']
# 't*' also accepts zero 't', so 'ne' is matched too.
re.findall('net*','net ne nett networks')
## ['net', 'ne', 'nett', 'net']
# The pattern r'\\' matches one literal backslash; every backslash is removed.
re.sub(r'\\','',abstract)
## '<p>We trained a large, deep convolutional neural network to classify the 1.3 million high-resolution images in the LSVRC-2010 ImageNet training set into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 39.7% and 18.9% which is considerably better than the previous state-of-the-art results. The neural network, which has 60 million parameters and 500,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and two globally connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of convolutional nets. To reduce overfitting in the globally connected layers we employed a new regularization method that proved to be very effective.</p>'
# Split on single spaces and keep the first five pieces.
re.split(' ',abstract)[:5]
## ['<p>We', 'trained', 'a', 'large,', 'deep']
Example | Description |
---|---|
^Python | Match “Python” at the start of a string |
Python$ | Match “Python” at the end of a string |
\APython | Match “Python” at the start of a string |
Python\Z | Match “Python” at the end of a string |
\bPython\b | Match “Python” at a word boundary |
\brub\B | \B is nonword boundary: match “rub” in “rube” and “ruby” but not alone |
Python(?=!) | Match “Python”, if followed by an exclamation point. |
Python(?!!) | Match “Python”, if not followed by an exclamation point. |
# Python
import re
words = ['neuron','neural','network','networks','deep convolutional neural network']
# re.match() only tries to match at the beginning of the string, which is
# exactly what the start anchors \A and ^ ask for.
[w for w in words if re.match(r'\Aneu',w)]
## ['neuron', 'neural']
[w for w in words if re.match(r'^neu',w)]
## ['neuron', 'neural']
# End-anchored patterns need re.search(): re.match() would require "work"
# to begin at position 0, so it finds nothing for any of these words.
[w for w in words if re.search(r'works?\Z',w)]
## ['network', 'networks', 'deep convolutional neural network']
[w for w in words if re.search(r'works?$',w)]
## ['network', 'networks', 'deep convolutional neural network']
Example | Description |
---|---|
[Pp]ython | Match “Python” or “python” |
rub[ye] | Match “ruby” or “rube” |
[aeiou] | Match any one lowercase vowel |
[0-9] | Match any digit; same as [0123456789] |
[a-z] | Match any lowercase ASCII letter |
[A-Z] | Match any uppercase ASCII letter |
[a-zA-Z0-9] | Match any of the above |
[^aeiou] | Match anything other than a lowercase vowel |
[^0-9] | Match anything other than a digit |
# Python
import re
# The class [15] matches a single '1' or '5' right after 'top-'.
re.findall('top-[15]',abstract)
## ['top-1', 'top-5']
# you can also create an iterator
re.finditer('[0-9]',abstract)
## <callable_iterator object at 0x7f15f930e8d0>
Example | Description |
---|---|
. | Match any character except newline |
\d | Match a digit: [0-9] |
\D | Match a nondigit: [^0-9] |
\s | Match a whitespace character: [ \t\r\n\f] |
\S | Match nonwhitespace: [^ \t\r\n\f] |
\w | Match a single word character: [A-Za-z0-9_] |
\W | Match a nonword character: [^A-Za-z0-9_] |
# Python
import re
# Patterns are raw strings so that \w and \d reach the regex engine as
# written; in a normal string literal they are invalid escape sequences.
# Remove every word character, leaving only punctuation and whitespace.
re.sub(r'\w','',abstract)
## '<> , . - - . , - - .\\% .\\% --- . , , , , - , - . , - . .</>'
# First digit found in the text.
re.search(r'\d',abstract)
## <_sre.SRE_Match object; span=(73, 74), match='1'>
Example | Description |
---|---|
ruby? | Match “rub” or “ruby”: the y is optional |
ruby* | Match “rub” plus 0 or more ys |
ruby+ | Match “rub” plus 1 or more ys |
\d{3} | Match exactly 3 digits |
\d{3,} | Match 3 or more digits |
\d{3,5} | Match 3, 4, or 5 digits |
Nongreedy repetition: this matches the smallest number of repetitions.
Example | Description |
---|---|
<.*> | Greedy repetition: matches "<python>perl>" |
<.*?> | Nongreedy: matches "<python>" in "<python>perl>" |
# Python
import re
# Raw string so that \d reaches the regex engine as written; in a normal
# string literal it is an invalid escape sequence.
# Runs of 2 to 4 consecutive digits.
re.findall(r'\d{2,4}',abstract)
## ['2010', '1000', '39', '18', '60', '500', '000', '1000']
# Greedy: .* runs up to the last ']' in the text.
re.findall(r'[[].*[]]',r'[[Hello] World]')
## ['[[Hello] World]']
# Nongreedy: .*? stops at the first ']'.
re.findall(r'[[].*?[]]',r'[[Hello] World]')
## ['[[Hello]']
Example | Description |
---|---|
\D\d+ | No group: + repeats \d |
(\D\d)+ | Grouped: + repeats \D\d pair |
# Python
# All patterns are raw strings so that \d reaches the regex engine as
# written; in a normal string literal it is an invalid escape sequence.
# No group: + repeats only \d.
re.findall(r'[A-Z]\d+', "A8 DF Z7 376 A28 A0A4A8")
## ['A8', 'Z7', 'A28', 'A0', 'A4', 'A8']
# With a capture group, findall() returns the group's last repetition
# rather than the whole match ('A8' instead of 'A0A4A8').
re.findall(r'([A-Z]\d)+', "A8 DF Z7 376 A28 A0A4A8")
## ['A8', 'Z7', 'A2', 'A8']
# normal capture group will hinder ability to get whole match
# non-capturing group to the rescue
# ?: in this context tells Python's regex library to not consider the parentheses as defining a capture. (It's still used for grouping)
re.findall(r'(?:[A-Z]\d)+', "A8 DF Z7 376 A28 A0A4A8")
## ['A8', 'Z7', 'A2', 'A0A4A8']
# finditer() yields Match objects, whose group() is always the whole match.
list(re.finditer(r'([A-Z]\d)+', "A8 DF Z7 376 A28 A0A4A8"))
## [<_sre.SRE_Match object; span=(0, 2), match='A8'>, <_sre.SRE_Match object; span=(6, 8), match='Z7'>, <_sre.SRE_Match object; span=(13, 15), match='A2'>, <_sre.SRE_Match object; span=(17, 23), match='A0A4A8'>]
[i.group() for i in re.finditer(r'[A-Z]\d+', "A8 DF Z7 376 A28 A0A4A8")]
## ['A8', 'Z7', 'A28', 'A0', 'A4', 'A8']
[i.group() for i in re.finditer(r'([A-Z]\d)+', "A8 DF Z7 376 A28 A0A4A8")]
## ['A8', 'Z7', 'A2', 'A0A4A8']
Backreferences: a backreference matches a previously matched group again.
Example | Description |
---|---|
([Pp])ython&\1ails | Match python&pails or Python&Pails |
(['"])[^\1]*\1 | Single or double-quoted string. \1 matches whatever the 1st group matched. \2 matches whatever the 2nd group matched, etc. |
Alternatives
python|perl : Match “python” or “perl”
rub(y|le) : Match “ruby” or “ruble”
Python(!+|\?) : Match “Python” followed by one or more ! or one ?
# Python
import re
# \1 in the replacement inserts the text captured by group 1 (the digits),
# so the surrounding square brackets are dropped.
re.sub(r'\[(\d+)\]', r'\1', '[38] min and [4] sec')
## '38 min and 4 sec'
re.sub(r'\[(\d+)\]', r'\1 long', '[38] min and [4] sec')
## '38 long min and 4 long sec'
# r'\10' is read as a reference to group 10, which does not exist in the
# pattern, so this call fails with the error shown below.
re.sub(r'\[(\d+)\]', r'\10', '[38] min and [4] sec')
## invalid group reference 10 at position 1
# \g<1> delimits the group number explicitly, so a literal 0 can follow it.
re.sub(r'\[(\d+)\]', r'\g<1>0', '[38] min and [4] sec')
## '380 min and 40 sec'
Here is the list of all university restaurants in Strasbourg. Could you create a program that would enable us to compare the menus of the open university restaurants in Strasbourg for the next three days? To do this, open the web page in your browser, inspect it by right-clicking, and find the correct HTML class.
https://www.crous-strasbourg.fr/se-restaurer/ou-manger/
Here’s how you can start in Python:
Visit https://www.whatismybrowser.com/detect/what-is-my-user-agent/ and copy your User Agent information (for example: “Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36”). Paste this information below into the user_agent dictionary.
Use the function get_page(urlpage) to retrieve the HTML content from the webpage.
from bs4 import BeautifulSoup
import time
import random
# Request headers passed to requests.get() in get_page(); replace the
# placeholder value with your own browser's User-Agent string.
user_agent = {'User-Agent':'YOUR/USER-AGENT/INFOS/HERE'}
def get_page(urlpage):
    """Download a web page and return its parsed HTML as a string.

    Parameters
    ----------
    urlpage : str
        Address of the page to download.

    Returns
    -------
    str
        The page's HTML after being parsed by BeautifulSoup with the
        'html.parser' backend and converted back to text.
    """
    # `requests` is called below but is not imported in the visible part of
    # this script; importing it here keeps the call from failing with NameError.
    import requests

    # Avoid getting banned: wait between 7 and 15 seconds before each request
    time.sleep(7 + random.uniform(0,8))
    # Get the html from the webpage, sending the module-level user_agent headers
    res = requests.get(urlpage, headers = user_agent)
    # Parse the html
    soup = BeautifulSoup(res.text, 'html.parser')
    return str(soup)
# Example: fetch the page of one restaurant (the call sleeps for several
# seconds before sending the request).
urlpage = 'https://www.crous-strasbourg.fr/restaurant/resto-u-esplanade/'
get_page(urlpage)