framework icon indicating copy to clipboard operation
framework copied to clipboard

Create benchmark and optimize the framework

Open roll opened this issue 5 years ago • 2 comments

Overview

The migration from tabulator/tableschema/datapackage/goodtables gave good speed improvement but we still can make it faster especially for working with numbers - https://github.com/frictionlessdata/frictionless-py/issues/461

Tasks

  • [ ] create a benchmark we can use to measure and catch regressions
  • [ ] improve read_rows speed (as an option, rebase from a dict row to a list row)
  • [ ] improve date/time validating/parsing (we need to try creating date/time objects on demand)
  • [ ] Investigate validation performance of Excel files with 100000+ rows

roll avatar Dec 03 '20 09:12 roll

Here is a type casting benchmark on Python3.8:

string: 205.771 ms
integer: 192.314 ms
number: 200.992 ms
boolean: 161.194 ms
date: 346.873 ms
datetime: 325.867 ms
object: 231.685 ms

We have similar performance for all the tested types except for date/time.

"""Benchmark frictionless cell type-casting speed, per field type.

For each field type, stream N single-cell rows through a Resource and
force casting/constraint checks via ``row.valid``; print the elapsed
wall time in milliseconds in the same ``<type>: <ms> ms`` format as the
original per-type sections.
"""
import csv
import datetime
from frictionless import Resource, Schema, validate

N = 10000

# (field type, raw cell value) pairs to benchmark. Each value is a
# string the corresponding type's cast function must parse; the first
# generated row doubles as the header, as in the original sections.
CASES = [
    ("string", "string"),
    ("integer", "10"),
    ("number", "10"),
    ("boolean", "true"),
    ("date", "2020-01-01"),
    ("datetime", "2020-01-01T14:00:00Z"),
    ("object", '{"x": 1}'),
]

for type_name, value in CASES:
    start = datetime.datetime.now()
    with Resource([[value]] * N) as resource:
        resource.schema.fields[0].type = type_name
        for row in resource.row_stream:
            row.valid  # triggers cell casting and constraint checking
    dt = (datetime.datetime.now() - start).total_seconds()
    print(f"{type_name}: {dt * 1e3} ms")

roll avatar Jul 14 '21 13:07 roll

Continuing the discussion from https://github.com/frictionlessdata/project/discussions/685:

@roll Your recent improvements sped things up nicely! Now, my benchmark, which used to take 1125 ms on my machine, now runs in less than 200 ms (v. 4.14.0).

frictionless: 183.0 ms
frictionless (list): 25.8 ms
frictionless (row): 93.5 ms
frictionless (row.valid): 140.7 ms
pure python: 8.8 ms
pandas: 13.7 ms
jsonschema: 213.6 ms
fastjsonschema: 19.0 ms

"""Benchmark frictionless row validation against pure python, pandas,
jsonschema and fastjsonschema on equivalent data and schemas.

Set ``CSV = True`` to round-trip the rows through a temporary CSV file
instead of validating the in-memory list of dicts.
"""
import csv
import datetime
import re
import tempfile

import fastjsonschema
import frictionless
import jsonschema
import pandas as pd

# --- Prepare data and metadata ----
N = 10000
MINIMUM = 1
PATTERN = r'[0-9]{4}(-[0-9]{2}(-[0-9]{2})?)?'
CSV = False

# Build data
sample = [
  { 'x': 1, 'y': '2001-01-01' }
]
data = sample * N
if CSV:
  f = tempfile.NamedTemporaryFile(delete=True, suffix='.csv')
  pd.DataFrame(data).to_csv(f, index=False)
  # Flush so the file is complete on disk before other readers open
  # f.name; without this the buffered tail of the CSV can be missing.
  f.flush()

# Build Resource descriptor with an embedded Table Schema
resource_descriptor = {
  **({'path': f.name} if CSV else {'data': data}),
  'schema': {
    'fields': [
      {
        'name': 'x',
        'type': 'integer',
        'constraints': {
          'minimum': MINIMUM
        }
      },
      {
        'name': 'y',
        'type': 'string',
        'constraints': {
          'pattern': PATTERN
        }
      }
    ]
  }
}

# Build equivalent JSON Schema
json_schema = {
  'type': 'array',
  'items': {
    'type': 'object',
      'properties': {
        'x': {'type': 'integer', 'minimum': MINIMUM},
        'y': {'type': 'string', 'pattern': PATTERN},
      },
      'required': ['x', 'y'],
      'additionalProperties': False
  }
}

# --- frictionless ---

# Full validation pipeline; report.time is measured by frictionless itself.
resource = frictionless.Resource(resource_descriptor, trusted=True)
report = frictionless.validate(resource, limit_errors=float('inf'))
print(f'frictionless: {report.time * 1e3} ms')

# --- list ---

# Lower bound: stream raw cell lists, no row objects, no casting.
start = datetime.datetime.now()
with frictionless.Resource(resource_descriptor, trusted=True) as resource:
  for cells in resource.list_stream:
      pass
dt = (datetime.datetime.now() - start).total_seconds()
print(f"frictionless (list): {dt * 1e3} ms")

# --- row ---

# Row construction, but no explicit validity check.
start = datetime.datetime.now()
with frictionless.Resource(resource_descriptor, trusted=True) as resource:
  for row in resource.row_stream:
    pass
dt = (datetime.datetime.now() - start).total_seconds()
print(f"frictionless (row): {dt * 1e3} ms")

# --- row.valid ---

# Row construction plus casting/constraint checks via row.valid.
start = datetime.datetime.now()
with frictionless.Resource(resource_descriptor, trusted=True) as resource:
  for row in resource.row_stream:
    row.valid
dt = (datetime.datetime.now() - start).total_seconds()
print(f"frictionless (row.valid): {dt * 1e3} ms")

# --- pure python ---

# Hand-rolled equivalent of the two Table Schema constraints above.
regex = re.compile(PATTERN)
start = datetime.datetime.now()
if CSV:
  reader = csv.DictReader(open(f.name))
  rows = [{'x': int(row['x']), 'y': row['y']} for row in reader]
else:
  rows = data
valid = [
  {
    'x': isinstance(row['x'], int) and row['x'] >= MINIMUM,
    'y': isinstance(row['y'], str) and regex.fullmatch(row['y']) is not None
  }
  for row in rows
]
dt = (datetime.datetime.now() - start).total_seconds()
print(f'pure python: {dt * 1e3} ms')

# --- pandas ---

# Vectorized equivalent of the same two constraints.
start = datetime.datetime.now()
df = pd.read_csv(f.name) if CSV else pd.DataFrame(data)
valid = {
  'x': df['x'].ge(MINIMUM),
  'y': df['y'].str.fullmatch(PATTERN)
}
dt = (datetime.datetime.now() - start).total_seconds()
print(f'pandas: {dt * 1e3} ms')

# ---- JSON Schema ----

# JSON Schema validators apply only to the in-memory data variant.
if not CSV:

  # --- jsonschema ---

  validator = jsonschema.Draft7Validator(json_schema)
  start = datetime.datetime.now()
  errors = list(validator.iter_errors(data))
  dt = (datetime.datetime.now() - start).total_seconds()
  print(f'jsonschema: {dt * 1e3} ms')

  # --- fastjsonschema ---

  validator = fastjsonschema.compile(json_schema)
  start = datetime.datetime.now()
  validator(data)
  dt = (datetime.datetime.now() - start).total_seconds()
  print(f'fastjsonschema: {dt * 1e3} ms')