Create benchmark and optimize the framework
Overview
The migration from tabulator/tableschema/datapackage/goodtables gave good speed improvement but we still can make it faster especially for working with numbers - https://github.com/frictionlessdata/frictionless-py/issues/461
Tasks
- [ ] create a benchmark we can use to measure and catch regressions
- [ ] improve `read_rows` speed (as an option, rebase from a dict row to a list row)
- [ ] improve `date/time` validating/parsing (we need to try creating date/time objects on demand)
- [ ] Investigate validation performance of Excel files with 100000+ rows
Here is a type casting benchmark on Python3.8:
string: 205.771 ms
integer: 192.314 ms
number: 200.992 ms
boolean: 161.194 ms
date: 346.873 ms
datetime: 325.867 ms
object: 231.685 ms
We have similar performance for all the tested types except for date/time.
# --- Type casting benchmark ---
# For each field type, stream N single-column rows through a Resource and
# touch `row.valid` to force every cell to be cast/validated, then report
# the elapsed wall-clock time in milliseconds.
import csv
import datetime

from frictionless import Resource, Schema, validate

# Number of rows streamed per field type.
N = 10000

# One benchmark case per field type: (type name, sample cell value).
# Driving the seven previously copy-pasted sections from a table keeps
# the timing logic in exactly one place.
CASES = [
    ("string", "string"),
    ("integer", "10"),
    ("number", "10"),
    ("boolean", "true"),
    ("date", "2020-01-01"),
    ("datetime", "2020-01-01T14:00:00Z"),
    ("object", '{"x": 1}'),
]

for field_type, cell in CASES:
    start = datetime.datetime.now()
    with Resource([[cell]] * N) as resource:
        resource.schema.fields[0].type = field_type
        for row in resource.row_stream:
            row.valid  # accessing `valid` triggers casting of the cell
    dt = (datetime.datetime.now() - start).total_seconds()
    print(f"{field_type}: {dt * 1e3} ms")
Continuing the discussion from https://github.com/frictionlessdata/project/discussions/685:
@roll Your recent improvements sped things up nicely! My benchmark, which used to take 1125 ms on my machine, now runs in less than 200 ms (v. 4.14.0).
frictionless: 183.0 ms
frictionless (list): 25.8 ms
frictionless (row): 93.5 ms
frictionless (row.valid): 140.7 ms
pure python: 8.8 ms
pandas: 13.7 ms
jsonschema: 213.6 ms
fastjsonschema: 19.0 ms
"""Comparison benchmark: frictionless vs pure Python, pandas, and JSON Schema.

Validates N rows of ``{'x': <integer >= MINIMUM>, 'y': <string matching
PATTERN>}`` with each tool and prints the elapsed time in milliseconds.
Set ``CSV = True`` to read the data from a temporary CSV file instead of
validating in-memory rows.
"""
import contextlib
import csv
import datetime
import re
import tempfile
import time

import fastjsonschema
import frictionless
import jsonschema
import pandas as pd


@contextlib.contextmanager
def timed(label):
    """Print the elapsed time of the enclosed block as '<label>: <ms> ms'."""
    # time.perf_counter() is monotonic and high-resolution, so it is the
    # right clock for timing code (datetime.now() can jump with the system
    # clock). The print format matches the original benchmark output.
    start = time.perf_counter()
    yield
    print(f'{label}: {(time.perf_counter() - start) * 1e3} ms')


# --- Prepare data and metadata ----
N = 10000
MINIMUM = 1
PATTERN = r'[0-9]{4}(-[0-9]{2}(-[0-9]{2})?)?'
CSV = False  # when True, benchmarks read from a temporary CSV file

# Build data
sample = [{'x': 1, 'y': '2001-01-01'}]
data = sample * N
if CSV:
    f = tempfile.NamedTemporaryFile(delete=True, suffix='.csv')
    pd.DataFrame(data).to_csv(f, index=False)

# Build Resource with Table Schema
resource_schema = {
    **({'path': f.name} if CSV else {'data': data}),
    'schema': {
        'fields': [
            {
                'name': 'x',
                'type': 'integer',
                'constraints': {'minimum': MINIMUM},
            },
            {
                'name': 'y',
                'type': 'string',
                'constraints': {'pattern': PATTERN},
            },
        ]
    },
}

# Build equivalent JSON Schema
json_schema = {
    'type': 'array',
    'items': {
        'type': 'object',
        'properties': {
            'x': {'type': 'integer', 'minimum': MINIMUM},
            'y': {'type': 'string', 'pattern': PATTERN},
        },
        'required': ['x', 'y'],
        'additionalProperties': False,
    },
}

# --- frictionless: full validation report (timed by frictionless itself) ---
resource = frictionless.Resource(resource_schema, trusted=True)
# limit_errors=inf disables error truncation so every row is checked
report = frictionless.validate(resource, limit_errors=float('inf'))
print(f'frictionless: {report.time * 1e3} ms')

# --- frictionless: raw cell streaming (no casting/validation) ---
with timed('frictionless (list)'):
    with frictionless.Resource(resource_schema, trusted=True) as resource:
        for cells in resource.list_stream:
            pass

# --- frictionless: row streaming (casting, no validation) ---
with timed('frictionless (row)'):
    with frictionless.Resource(resource_schema, trusted=True) as resource:
        for row in resource.row_stream:
            pass

# --- frictionless: row streaming + validation ---
with timed('frictionless (row.valid)'):
    with frictionless.Resource(resource_schema, trusted=True) as resource:
        for row in resource.row_stream:
            row.valid  # accessing `valid` triggers per-row validation

# --- pure python baseline ---
regex = re.compile(PATTERN)  # compiled outside the timed section, as before
with timed('pure python'):
    if CSV:
        reader = csv.DictReader(open(f.name))
        rows = [{'x': int(row['x']), 'y': row['y']} for row in reader]
    else:
        rows = data
    valid = [
        {
            'x': isinstance(row['x'], int) and row['x'] >= MINIMUM,
            'y': isinstance(row['y'], str) and regex.fullmatch(row['y']) is not None,
        }
        for row in rows
    ]

# --- pandas (vectorized) ---
with timed('pandas'):
    df = pd.read_csv(f.name) if CSV else pd.DataFrame(data)
    valid = {
        'x': df['x'].ge(MINIMUM),
        'y': df['y'].str.fullmatch(PATTERN),
    }

# ---- JSON Schema (in-memory data only) ----
if not CSV:
    # --- jsonschema (validator built outside the timed section) ---
    validator = jsonschema.Draft7Validator(json_schema)
    with timed('jsonschema'):
        errors = list(validator.iter_errors(data))

    # --- fastjsonschema (schema compiled outside the timed section) ---
    validator = fastjsonschema.compile(json_schema)
    with timed('fastjsonschema'):
        validator(data)