I've been compressing datasets with zstd and reading them back via streaming decompression to save space, reduce SSD wear, and speed up access to compressed data. Streaming is also useful for previewing datasets saved as .zst while they are still downloading. I couldn't find anything readily available online on how to do this, so hopefully this saves someone else some time:
# installation: python -m pip install zstandard ijson
import zstandard as zstd
import ijson
# Streaming decompression for JSON files compressed with `zstd --long=31`.
# max_window_size must cover the 2 GiB (2**31) window that --long=31 permits,
# otherwise frames using long-range matching fail to decompress.
with open("laion_filtered.json.zst", "rb") as compressed:
    decompressor = zstd.ZstdDecompressor(max_window_size=2**31)
    with decompressor.stream_reader(compressed) as reader:
        # ijson parses incrementally: "item" yields each element of the
        # top-level JSON array without loading the whole document into memory.
        for record in ijson.items(reader, "item"):
            print(record)
import io
import json
# Streaming decompression for NDJSON/JSONL files compressed with `zstd --long=31`.
with open("00.jsonl.zst", "rb") as compressed:
    # max_window_size covers the 2 GiB window that --long=31 permits.
    decompressor = zstd.ZstdDecompressor(max_window_size=2**31)
    with decompressor.stream_reader(compressed) as reader:
        # BufferedReader provides line iteration over the decompressed bytes;
        # each line is one standalone JSON document.
        for raw_line in io.BufferedReader(reader):
            print(json.loads(raw_line))
import csv
# Streaming decompression for TSV files compressed with `zstd --long=31`.
with open("Image_Labels_Subset_Train_GCC-Labels-training.tsv.zst", "rb") as f:
    # max_window_size covers the 2 GiB (2**31) window that --long=31 permits.
    dctx = zstd.ZstdDecompressor(max_window_size=2**31)
    with dctx.stream_reader(f) as reader:
        # BUG FIX: the original created a BufferedReader but then wrapped the
        # raw stream_reader in TextIOWrapper, leaving buffered_reader unused
        # and bypassing buffering. Wrap the buffered layer instead, and pin
        # the encoding so decoding doesn't depend on the locale.
        buffered_reader = io.BufferedReader(reader)
        wrapper = io.TextIOWrapper(buffered_reader, encoding="utf-8")
        csv_reader = csv.reader(wrapper, delimiter="\t")  # or csv.DictReader if it has fieldnames
        for record in csv_reader:
            print(record)
I also recommend building zstd from source, since the latest version has improved performance and compression ratios.