Skip to main content
Python

Python #

(Maybe) Useful resources #

Tutorial/snippets/examples:

Books/courses:

File handling #

CSV #

This section focuses on the Python module csv. For the package pandas, see Pandas.

Docs: csv — CSV File Reading and Writing — Python 3.11.4 documentation

Import as list: (Ref)

import csv

with open("data.csv", newline="") as f:
    reader = csv.reader(f)
    data = list(reader)

print(data)
# [
#   ["first", "row"], ["second", "row"], …
# ]

JSON #

Split a big JSON file into smaller files: (credit)

import os
import json
from itertools import islice

def split_json(
    data_path,
    file_name,
    size_split=1000,
):
    """Split a big JSON file into chunks of at most ``size_split`` entries.

    The input ``<data_path>/<file_name>.json`` must contain a JSON object
    (its key-value pairs are what gets distributed).  Chunk ``i`` out of
    ``n`` is written to ``<data_path>/<file_name>_<n>_<i>.json``, with both
    numbers 1-based.  An empty input still produces one (empty) chunk.

    data_path  : folder holding the input file; chunks are written there too
    file_name  : base name of the input file, exclude ".json"
    size_split : maximum number of key-value pairs per chunk

    Raises ValueError if ``size_split`` is smaller than 1.
    """
    if size_split < 1:
        raise ValueError("size_split must be a positive integer")

    with open(os.path.join(data_path, file_name + ".json"), "r") as f:
        whole_file = json.load(f)

    # Ceiling division, so an exact multiple of size_split does not yield a
    # trailing empty chunk (and the total in the file names is the real one).
    n_chunks = max(1, -(-len(whole_file) // size_split))

    # One shared iterator: every islice() call resumes where the previous one
    # stopped instead of re-walking the dictionary from the beginning.
    items = iter(whole_file.items())
    for i in range(1, n_chunks + 1):
        chunk_name = file_name + "_" + str(n_chunks) + "_" + str(i) + ".json"
        with open(os.path.join(data_path, chunk_name), 'w') as f:
            json.dump(dict(islice(items, size_split)), f)

Merge said smaller files into one file: (my answer)

# Rebuild one dictionary from the numbered chunk files.
split = 4         # this is the 1-based actual number of splits
json_all = {}

for chunk_no in range(1, split + 1):
    # chunk files are named data_file_<total>_<index>.json
    chunk_path = os.path.join("data_folder", "data_file_" + str(split) + "_" + str(chunk_no) + ".json")
    with open(chunk_path, 'r') as f:
        json_all.update(json.load(f))

Objects #

Dictionary #

Append #

new_dict = {}
new_dict.update({"name": number})

Get head of a dictionary: (credit)

from itertools import islice

dict(islice(my_dict.items(), 0, 5))

Loop #

# key only
for key in my_dict:
    print(key)

# value only
for value in my_dict.values():
    print(value)

# key-value
for key, value in my_dict.items():
    print(key, value)

Loop in reverse (descending) key order: (ref)

for key, value in sorted(my_dict.items(), key=lambda x: x[0], reverse=True):
    print(key, value)

Keys:

if key in my_dict:
    # placeholder: code that relies on `key` being present goes here
    ...

List #

Doc: 5. Data Structures — Python 3.12.2 documentation

Find index of #

Find the index of minimum in a list: (credit)

val, index = min((val, index) for (index, val) in enumerate(my_list))

Find n-th element #

# first
my_list[0]

# last
my_list[-1]

# middle
# https://stackoverflow.com/questions/38130895/find-middle-of-a-list#comment105177234_38131003
my_list[int(len(my_list)//2)]

Export to file #

Write a list to file: (credit)

import os

my_list = ["A", "B", "C"]

# make sure the target folder exists; open() does not create it
os.makedirs("data", exist_ok=True)

# open file in write mode
with open(os.path.join("data", "log.txt"), 'w') as f:
    # write each item on a new line; f-string formatting also copes with
    # tuple items, which "%s\n" % item would try to unpack
    f.writelines(f"{item}\n" for item in my_list)

print("Done.")

Sort #

my_list = ["B", "C", "D"]
my_list.sort()
print(my_list)
# ['B', 'C', 'D']

Remove empty elements #

Ref: python - Remove empty strings from a list of strings - Stack Overflow

my_list = ["A", "", "B", ""]
# keep only the truthy (non-empty) entries
my_list = [entry for entry in my_list if entry]
print(my_list)
# ['A', 'B']

Remove duplicated elements #

Ref: python - Removing duplicates in lists - Stack Overflow

my_list = ["A", "A", "B", "C"]
# a set drops the duplicates; sorted() turns it back into an ordered list
my_list = sorted(set(my_list))
print(my_list)
# ['A', 'B', 'C']

Remove elements from another list #

Ref: python - How do I subtract one list from another? - Stack Overflow

x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
y = [1, 3, 5, 7, 9]

# Build the lookup set once: membership tests on a set are O(1), whereas
# testing against the list y makes the subtraction O(len(x) * len(y)).
# (Requires hashable items, which holds for the integers here.)
excluded = set(y)

[item for item in x if item not in excluded]
# [0, 2, 4, 6, 8]

Pickle #

See Writing functions #Dark magic for the wrapper function version.

import os
import pickle

pickle_path = os.path.join("data", "data.pickle") # path: ./data/data.pickle

# create pickle
with open(pickle_path, "wb") as f:
    pickle.dump(df, f)

# use pickle
if os.path.isfile(pickle_path):
    with open(pickle_path, "rb") as f:
        df = pickle.load(f)
else:
    print("Pickle not found. Please create one first.")

String #

Add leading zeros #

Ref: python - Best way to format integer as string with leading zeros? - Stack Overflow

year = [2015]
month = list(range(1, 13))

for y in year:
    for m in month:
        print(str(y) + "_" + str(m).zfill(2))

This gives the same periods as the snippet below (printed as 2015_01 rather than 2015-01), but exposes the year and month variables to tinker with: (ref)

import pandas as pd

pr = pd.period_range(start='2015-01',end='2015-12', freq='M')
for period in pr:
    print(period)

functools #

Doc: functools — Higher-order functions and operations on callable objects — Python 3.12.2 documentation

partial #

Ref: python - How does functools partial do what it does? - Stack Overflow

itertools #

Doc: itertools — Functions creating iterators for efficient looping — Python 3.12.2 documentation

combinations #

Input: list p, integer r

Output: all r-combinations of items in p, without repetition; order within a combination does not matter.

from itertools import combinations

list(combinations('ABCD', 2))
# [('A', 'B'), ('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D'), ('C', 'D')]
list(combinations(range(4), 3))
# [(0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3)]

product #

Input: list p, list q

Output: Cartesian product between p and q.

Example: List months in a year

import itertools

year = [2015]
month = list(range(1, 13))

for period in itertools.product(year, month):
    print(period)

Multiple iteration values #

Get previous and next iteration value: (Ref, adapted for Python 3)

from itertools import tee, islice, chain

def p_i_n(some_iterable):
    """Pair every item with its predecessor and its successor.

    Returns a lazy iterator of ``(previous, item, next)`` tuples; ``None``
    stands in for the missing neighbour at either end.
    """
    behind, current, ahead = tee(some_iterable, 3)
    # shift one copy back by putting a None in front of it ...
    lagging = chain([None], behind)
    # ... and one copy forward by dropping its first item and padding the end
    leading = chain(islice(ahead, 1, None), [None])
    return zip(lagging, current, leading)

for p,i,n in p_i_n(my_list):
    print(p)
    print(i)
    print(n)
    break

for p,i,n in p_i_n(my_dict.items()):
    key, value = i
    print(key)
    print(value)
    break

Print #

Refs:

Bold #

Ref: How can I print bold text in Python? - Stack Overflow

print("\033[1m" + "hello world" + "\033[0m")

Table #

Ref: python - Printing Lists as Tabular Data - Stack Overflow

TBA.

JSON/Dictionary #

Pretty print: (credit)

json.dumps() raises a TypeError if the dictionary has keys of a non-native type such as NumPy's int32.

import json

print(json.dumps(json.loads(json_data), indent=4))

# if get TypeError: the JSON object must be str, bytes or bytearray, not dict
# then use:
print(json.dumps(json_data, indent=4))

Statements #

break, pass, continue #

Ref: How To Use Break, Continue, and Pass Statements when Working with Loops in Python 3 | DigitalOcean

break: stop the whole loop.

# range() is already iterable; no need to build a throwaway list first
for i in range(1, 11):
    print(i)
    break

# Output: 1

continue: skip everything else in the current iteration, and go to the next iteration.

for file in files_list:
    # we only want to download the files
    if os.path.isfile(file): # file already exists
        continue
    # download

pass: do nothing (a placeholder where a statement is required); the rest of the current iteration still runs.

for file in files_list:
    # we want to download the file and further process it
    if os.path.isfile(file): # file already exists
        pass
    else:
        # download
        ...  # placeholder statement: a block cannot consist of a comment only
    # other things to do with the file

Loop #

Multiple iteration:

zip stops as soon as the shortest of the given iterables is exhausted.

# the items are string literals; bare a, b, c would be undefined names
lower = ["a", "b", "c"]
upper = ["A", "B", "C"]

for lo, up in zip(lower, upper):
    print(f"{lo} = {up}")

Stop after a certain time: (ref)

import time

# monotonic clock: never jumps when the system time is adjusted, so it is
# the right tool for measuring a duration
start_time = time.monotonic()
time_to_run = 60*5          # 5 minutes
end_time = start_time + time_to_run

k = 0                       # running total; must exist before the loop reads it

for i in range(0, 100000):
    # time budget used up: leave the loop early
    if time.monotonic() >= end_time:
        break

    k = k + i

Match case (Switch) #

This is a new feature in Python 3.10.

Regex match I use to import some badly headered datasets: (ref)

import re

class RegexEqual(str):
    """A ``str`` whose ``==`` runs a regex match instead of comparing text.

    ``RegexEqual(text) == pattern`` is true when ``pattern`` matches at the
    start of ``text`` (``re.match`` semantics).  Used as the subject of a
    ``match`` statement, this makes string ``case`` patterns behave as
    regular expressions.
    """

    # Overriding __eq__ implicitly sets __hash__ to None; restore str's hash
    # so instances remain usable as dict keys and set members.
    __hash__ = str.__hash__

    def __eq__(self, pattern):
        # Only strings and compiled patterns can be matched; for anything
        # else defer to Python's default comparison instead of letting
        # re.match raise TypeError.
        if not isinstance(pattern, (str, re.Pattern)):
            return NotImplemented
        return bool(re.match(pattern, self))

def usecols_fn(col):
    match RegexEqual(col):
        case "ID":
            return True
        case "[A-z\s]*Date":
            return True
        case "[A-z\s]*Rating":
            return True
    return False

df = pd.read_excel(
    "data.xlsx",
    sheet_name="Sheet 1",
    usecols=usecols_fn,
    index_col="ID",
)

Useful tricks #

Docstring/Help #

Like R’s ? function.

help(np.log)

# If it is a method, need something that the method can run from:
help(df.reset_index)

lambda functions #

View all attributes/methods #

Print all attributes (& methods) of an object: (credit)

for attr in dir(object_a):
    if not attr.startswith('_'):
        print(attr)

Print all methods of an object: (credit)

for method in dir(object_a):
    if callable(getattr(object_a, method)) and (not method.startswith('_')):
        print(method)

Ternary conditional operator #

Ref: Does Python have a ternary conditional operator? - Stack Overflow