Speed comparison - V1 vs V2 (using dates)#

The comparison was made on a machine with the following specifications:

Operating System: Windows 10 Pro 64-bit (10.0, Build 19042) (19041.vb_release.191206-1406)
       Processor: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz (12 CPUs), ~2.6GHz
          Memory: 16384MB RAM

Key packages in the environment:

Python: 3.9.0
Pandas: 1.3.0
 Numpy: 1.21.0

Timeit setup#

[1]:
import numpy as np
import timeit
[2]:
# Setup code prepended to every benchmark: imports both library versions and
# defines make_data, the random interval generator used by the extra setup
# snippets.
setup = """
import staircase_one as sc1
import staircase as sc2
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def make_data(use_dates, intervals=4000):
    # Random (start, end, value) rows.  Endpoints are drawn in minutes; when
    # use_dates is true they are converted to timestamps offset from 2021.
    #
    # starts is cast to float up front so that missing starts can be stored
    # as NaN without relying on an implicit int -> float upcast on assignment.
    starts = np.random.randint(-24*60, 24*365*60, intervals).astype(float)
    ends = starts + np.round(np.random.triangular(2*60, 8*60, 24*60, intervals))
    values = np.random.randint(1, 20, intervals)

    df = pd.DataFrame({"start":starts, "end":ends, "value":values}).query("end > 0")
    # Endpoints outside [0, 365 days] are replaced by NaN.
    df.loc[df["start"] < 0, "start"] = np.nan
    df.loc[df["end"] > 365*24*60, "end"] = np.nan

    if use_dates:
        df["start"] = pd.to_datetime(df["start"], unit="m", origin=pd.Timestamp("2021"))
        df["end"] = pd.to_datetime(df["end"], unit="m", origin=pd.Timestamp("2021"))

    # Two-pass sort: by end, then by start.  The second pass must be stable,
    # otherwise rows sharing a start lose their end ordering (the default
    # quicksort gives no such guarantee).
    return (
        df.sort_values('end', na_position='last')
        .sort_values('start', na_position='first', kind='mergesort')
    )

"""

[3]:
# Extra setup for the single-object benchmarks: draws two random frames with
# make_data and builds one Stairs object per frame with each library version
# (s1A/s1B via staircase_one, s2A/s2B via staircase).  Only the start and end
# columns are layered here; the "value" column is not used.
create_stairs = """
use_dates = True

dfA = make_data(use_dates)
dfB = make_data(use_dates)

s1A = sc1.Stairs(use_dates=use_dates).layer(dfA['start'], dfA['end'])
s1B = sc1.Stairs(use_dates=use_dates).layer(dfB['start'], dfB['end'])

s2A = sc2.Stairs().layer(dfA['start'], dfA['end'])
s2B = sc2.Stairs().layer(dfB['start'], dfB['end'])
"""

[4]:
from IPython.display import display, Markdown

def compare(stmt1, stmt2, repeat, number, extra_setup):
    """Time two statements and display how many times faster the second is.

    Each statement is timed with ``timeit.repeat``, using the module-level
    ``setup`` string followed by ``extra_setup`` as setup code.  The setup
    code is executed again for every repetition, so with ``number=1`` each
    measurement runs against freshly created objects.

    Parameters
    ----------
    stmt1 : str
        Baseline statement (numerator of the displayed ratio).
    stmt2 : str
        Statement compared against the baseline (denominator).
    repeat : int
        Number of independent timings per statement; the fastest is kept.
    number : int
        Number of executions of the statement within each timing.
    extra_setup : str
        Code appended to ``setup`` before timing.

    Returns
    -------
    None
        The result is rendered as a Markdown heading via ``display``.
    """

    def best_time(stmt):
        # timeit.repeat yields one *total* per repetition, each covering
        # `number` executions, so the per-execution time is total / number.
        totals = timeit.repeat(
            stmt,
            setup=setup + extra_setup,
            repeat=repeat,
            number=number,
        )
        # np.min keeps the result a NumPy float, so a zero denominator below
        # produces inf (with a warning) rather than raising.
        return np.min(totals) / number

    ratio = best_time(stmt1) / best_time(stmt2)
    if ratio < 1:
        # Slow-downs are shown with two decimals.
        speedup = f"{ratio:.2f}"
    else:
        # Speed-ups are rounded to two significant figures; a trailing ".0"
        # is dropped so that e.g. 12.0 is shown as "12".
        speedup = str(float(f"{ratio:.2g}")).removesuffix(".0")
    display(Markdown(f"## Speed up ~ {speedup}x"))

Creation + layering#

[5]:
# Building a Stairs object from one frame: v1 layers the three columns onto an
# empty Stairs, v2 passes the frame and its column names to the constructor.
stmt1 = 'sc1.Stairs(use_dates=True).layer(dfA["start"], dfA["end"], dfA["value"])'
stmt2 = 'sc2.Stairs(dfA, start="start", end="end", value="value")'
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 3.4x#

[6]:
# Building a Stairs object from dfA and then layering dfB on top of it.
stmt1 = 'sc1.Stairs(use_dates=True).layer(dfA["start"], dfA["end"], dfA["value"]).layer(dfB["start"], dfB["end"], dfB["value"])'
stmt2 = 'sc2.Stairs(dfA, start="start", end="end", value="value").layer("start", "end", "value", frame = dfB)'
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 3.1x#

Descriptive statistics#

[7]:
# max() on a single Stairs object.
stmt1 = "s1A.max()"
stmt2 = "s2A.max()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 29x#

[8]:
# min() on a single Stairs object.
stmt1 = "s1A.min()"
stmt2 = "s2A.min()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 30x#

[9]:
# Integration: the method is spelled integrate() in v1 and integral() in v2.
stmt1 = "s1A.integrate()"
stmt2 = "s2A.integral()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 5.1x#

[10]:
# mean() on a single Stairs object.
stmt1 = "s1A.mean()"
stmt2 = "s2A.mean()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 5.1x#

[11]:
# median() on a single Stairs object.
stmt1 = "s1A.median()"
stmt2 = "s2A.median()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 13x#

[12]:
# percentile(20) on a single Stairs object, same spelling in both versions.
stmt1 = "s1A.percentile(20)"
stmt2 = "s2A.percentile(20)"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 13x#

[13]:
# The same quantile expressed two ways: percentile(0.2*100) in v1 against
# fractile(0.2) in v2.  The repeat count is 1000, matching the other
# benchmarks that use create_stairs.
stmt1 = "s1A.percentile(0.2*100)"
stmt2 = "s2A.fractile(0.2)"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 13x#

[14]:
# std() on a single Stairs object.
stmt1 = "s1A.std()"
stmt2 = "s2A.std()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 7.7x#

[15]:
# var() on a single Stairs object.
stmt1 = "s1A.var()"
stmt2 = "s2A.var()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 7.8x#

[16]:
# cov() between the two Stairs objects built by create_stairs.
stmt1 = "s1A.cov(s1B)"
stmt2 = "s2A.cov(s2B)"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 9.1x#

[17]:
# corr() between the two Stairs objects built by create_stairs.
stmt1 = "s1A.corr(s1B)"
stmt2 = "s2A.corr(s2B)"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 7.8x#

Arithmetic#

[18]:
# Addition of two Stairs objects.
stmt1 = "s1A + s1B"
stmt2 = "s2A + s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 8x#

[19]:
# Subtraction of two Stairs objects.
stmt1 = "s1A - s1B"
stmt2 = "s2A - s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 8x#

[20]:
# Multiplication of two Stairs objects.
stmt1 = "s1A * s1B"
stmt2 = "s2A * s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 11x#

[21]:
# Division of two Stairs objects.
# NOTE(review): the v1 statement adds a small constant to the divisor while
# the v2 statement divides by s2B directly, so the v1 timing also includes
# that addition.  Presumably this works around zero-valued divisors in v1;
# confirm before reading the ratio as a like-for-like comparison.
stmt1 = "s1A / (s1B+0.0000001)"
stmt2 = "s2A / s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 19x#

Relational#

[22]:
# "<" comparison of two Stairs objects.
stmt1 = "s1A < s1B"
stmt2 = "s2A < s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 12x#

[23]:
# "<=" comparison of two Stairs objects.
stmt1 = "s1A <= s1B"
stmt2 = "s2A <= s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 12x#

[24]:
# ">" comparison of two Stairs objects.
stmt1 = "s1A > s1B"
stmt2 = "s2A > s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 12x#

[25]:
# ">=" comparison of two Stairs objects.
stmt1 = "s1A >= s1B"
stmt2 = "s2A >= s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 12x#

[26]:
# "==" comparison of two Stairs objects.
stmt1 = "s1A == s1B"
stmt2 = "s2A == s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 12x#

[27]:
# "!=" comparison of two Stairs objects.
stmt1 = "s1A != s1B"
stmt2 = "s2A != s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 12x#

Logical#

[28]:
# make_boolean() on a single Stairs object.  The repeat count is 1000,
# matching the other benchmarks that use create_stairs.
stmt1 = "s1A.make_boolean()"
stmt2 = "s2A.make_boolean()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 15x#

[29]:
# Unary "~" applied to a single Stairs object.
stmt1 = "~s1A"
stmt2 = "~s2A"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 16x#

[30]:
# "&" applied to two Stairs objects.
stmt1 = "s1A & s1B"
stmt2 = "s2A & s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 14x#

[31]:
# "|" applied to two Stairs objects.  The repeat count is 1000, matching the
# other benchmarks that use create_stairs.
stmt1 = "s1A | s1B"
stmt2 = "s2A | s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 13x#

Distribution#

[32]:
# ECDF: v1 calls the ecdf_stairs() method, v2 reads the ecdf attribute (no
# call).  With number=1 each measurement runs once per execution of the
# setup code.
stmt1 = "s1A.ecdf_stairs()"
stmt2 = "s2A.ecdf"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 19x#

[33]:
# Percentile function: v1 calls the percentile_stairs() method, v2 reads the
# percentile attribute (no call).
stmt1 = "s1A.percentile_stairs()"
stmt2 = "s2A.percentile"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)

Speed up ~ 14x#

[34]:
# Histogram: v1 receives bin edges, v2 receives an IntervalIndex of bins.
stmt1 = "s1A.hist(bin_edges=bin_edges)"
stmt2 = "s2A.hist(bins=bins)"

# Ten left-closed bins (eleven edges) spanning the range of s2A.  A small
# offset is added to the maximum, presumably so that the maximum itself falls
# inside the last left-closed bin.
hist_setup = """
low, high = s2A.min(), s2A.max()+0.0001
bins = pd.interval_range(low, high, 10, closed='left')
bin_edges=np.linspace(low, high, 11)
"""

compare(
    stmt1,
    stmt2,
    repeat=1000,
    number=1,
    extra_setup=create_stairs + hist_setup,
)

Speed up ~ 22x#

Array methods#

[35]:
# Extra setup for the array benchmarks: draws ten random frames with make_data
# and builds, for each library version, a list holding one Stairs object per
# frame.  Unlike create_stairs, the "value" column is layered as well.
create_arrays = """
use_dates=True
dfs = [make_data(use_dates) for i in range(10)]
s1array = [sc1.Stairs(use_dates=use_dates).layer(df['start'], df['end'], df['value']) for df in dfs]
s2array = [sc2.Stairs().layer(df['start'], df['end'], df['value']) for df in dfs]
"""

[36]:
# Module-level mean() over a list of ten Stairs objects.
stmt1 = "sc1.mean(s1array)"
stmt2 = "sc2.mean(s2array)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays)

Speed up ~ 36x#

[37]:
# Module-level median() over a list of ten Stairs objects.
stmt1 = "sc1.median(s1array)"
stmt2 = "sc2.median(s2array)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays)

Speed up ~ 35x#

[38]:
# Module-level min() over a list of ten Stairs objects.
stmt1 = "sc1.min(s1array)"
stmt2 = "sc2.min(s2array)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays)

Speed up ~ 37x#

[39]:
# Module-level max() over a list of ten Stairs objects.
stmt1 = "sc1.max(s1array)"
stmt2 = "sc2.max(s2array)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays)

Speed up ~ 36x#

[40]:
# Module-level sample() of a collection of Stairs objects at the points in x.
stmt1 = "sc1.sample(s1array, x)"
stmt2 = "sc2.sample(s2array, x)"

# x is a daily date range from the start of 2021 to the start of 2022; the
# lists built by create_arrays are wrapped in pandas Series before sampling.
sample_setup = """
x = pd.date_range('2021', '2022')
s1array = pd.Series(s1array)
s2array = pd.Series(s2array)
"""

compare(
    stmt1,
    stmt2,
    repeat=100,
    number=1,
    extra_setup=create_arrays + sample_setup,
)

Speed up ~ 8.6x#