Speed comparison - V1 vs V2
Comparison made on a machine with the following specs:
Operating System: Windows 10 Pro 64-bit (10.0, Build 19042) (19041.vb_release.191206-1406)
Processor: Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz (12 CPUs), ~2.6GHz
Memory: 16384MB RAM
Key components in environment:
Python: 3.9.0
Pandas: 1.3.0
Numpy: 1.21.0
Timeit setup
[1]:
import numpy as np
import timeit
[2]:
# Shared ``timeit`` setup code: imports both staircase versions (v1 as ``sc1``,
# v2 as ``sc2``) and defines ``make_data``, the random interval generator used
# by every benchmark below.
setup = """
import staircase_one as sc1
import staircase as sc2
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)


def make_data(use_dates, intervals=4000):
    # Random intervals expressed in minutes.  Starts are cast to float up
    # front because some of them are replaced by NaN further down, and NaN
    # cannot be stored in an integer column.
    starts = np.random.randint(-24*60, 24*365*60, intervals).astype(float)
    ends = starts + np.round(np.random.triangular(2*60, 8*60, 24*60, intervals))
    values = np.random.randint(1, 20, intervals)
    # .copy() gives an independent frame, so the masked assignments below do
    # not act on the filtered result of query().
    df = pd.DataFrame({"start": starts, "end": ends, "value": values}).query("end > 0").copy()
    # Starts below 0 and ends above 365*24*60 are replaced by NaN.
    df.loc[df["start"] < 0, "start"] = np.nan
    df.loc[df["end"] > 365*24*60, "end"] = np.nan
    if use_dates:
        # Interpret the numbers as minutes counted from the start of 2021.
        df["start"] = pd.to_datetime(df["start"], unit="m", origin=pd.Timestamp("2021"))
        df["end"] = pd.to_datetime(df["end"], unit="m", origin=pd.Timestamp("2021"))
    # The second sort has to be stable, otherwise the ordering by "end"
    # established by the first sort is not preserved among equal starts.
    return (
        df.sort_values("end", na_position="last")
        .sort_values("start", na_position="first", kind="mergesort")
    )
"""
[3]:
# Extra ``timeit`` setup for the single-object benchmarks: generates two random
# interval tables (dfA, dfB) with numeric endpoints and builds, from the
# start/end columns of each, one v1 Stairs (s1A, s1B) and one v2 Stairs
# (s2A, s2B).
create_stairs = """
use_dates = False
dfA = make_data(use_dates)
dfB = make_data(use_dates)
s1A = sc1.Stairs(use_dates=use_dates).layer(dfA['start'], dfA['end'])
s1B = sc1.Stairs(use_dates=use_dates).layer(dfB['start'], dfB['end'])
s2A = sc2.Stairs().layer(dfA['start'], dfA['end'])
s2B = sc2.Stairs().layer(dfB['start'], dfB['end'])
"""
[4]:
from IPython.display import display, Markdown
def compare(stmt1, stmt2, repeat, number, extra_setup):
    """Time two statements and display how many times faster the second one is.

    Each statement is timed with ``timeit.repeat`` using the module-level
    ``setup`` code followed by ``extra_setup``.  The best (minimum) time per
    execution of each statement is taken, and the ratio
    ``time(stmt1) / time(stmt2)`` is rendered as a Markdown heading.

    Parameters
    ----------
    stmt1 : str
        Baseline statement (the numerator of the ratio).
    stmt2 : str
        Statement compared against the baseline (the denominator).
    repeat : int
        Number of timing repetitions; the setup code runs before each one.
    number : int
        Number of executions of the statement within each repetition.
    extra_setup : str
        Setup code appended to the shared ``setup`` string.
    """

    def best_time(stmt):
        timings = timeit.repeat(
            stmt,
            setup=setup + extra_setup,
            repeat=repeat,
            number=number,
        )
        # Each entry is the total for ``number`` executions, so dividing by
        # ``number`` (not by ``repeat``) gives the time of one execution.
        return np.divide(timings, number).min()

    ratio = best_time(stmt1) / best_time(stmt2)
    if ratio < 1:
        speedup = f"{ratio:.2f}"
    else:
        # Two significant figures, dropping a trailing ".0" (33.0 -> "33").
        speedup = str(float(f"{ratio:.2g}")).removesuffix(".0")
    display(Markdown(f"## Speed up ~ {speedup}x"))
Creation + layering
[5]:
# Building a Stairs object from dfA's start/end/value columns: v1 constructs an
# empty object and layers onto it, v2 passes the frame to the constructor.
stmt1 = 'sc1.Stairs(use_dates=False).layer(dfA["start"], dfA["end"], dfA["value"])'
stmt2 = 'sc2.Stairs(dfA, start="start", end="end", value="value")'
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 3.1x¶
[6]:
# Building a Stairs object from dfA and then layering dfB's intervals onto it,
# in v1 and in v2.
stmt1 = 'sc1.Stairs(use_dates=False).layer(dfA["start"], dfA["end"], dfA["value"]).layer(dfB["start"], dfB["end"], dfB["value"])'
stmt2 = 'sc2.Stairs(dfA, start="start", end="end", value="value").layer(start="start", end="end", value="value", frame=dfB)'
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 2.8x¶
Descriptive statistics
[7]:
# ``max()`` on a single Stairs object: v1 (s1A) against v2 (s2A).
stmt1, stmt2 = "s1A.max()", "s2A.max()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 33x¶
[8]:
# ``min()`` on a single Stairs object: v1 (s1A) against v2 (s2A).
stmt1, stmt2 = "s1A.min()", "s2A.min()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 33x¶
[9]:
# v1 ``integrate()`` against v2 ``integral()`` (the method name differs
# between the two versions).
stmt1, stmt2 = "s1A.integrate()", "s2A.integral()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 6.4x¶
[10]:
# ``mean()`` on a single Stairs object: v1 (s1A) against v2 (s2A).
stmt1, stmt2 = "s1A.mean()", "s2A.mean()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 6.3x¶
[11]:
# ``median()`` on a single Stairs object: v1 (s1A) against v2 (s2A).
stmt1, stmt2 = "s1A.median()", "s2A.median()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 4.9x¶
[12]:
# ``percentile(20)`` on a single Stairs object: v1 (s1A) against v2 (s2A).
stmt1, stmt2 = "s1A.percentile(20)", "s2A.percentile(20)"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 4.9x¶
[13]:
# v2 ``fractile(0.2)`` against the v1 equivalent written as a percentile.
# Uses 1000 repetitions, matching the other descriptive-statistics benchmarks.
stmt1 = "s1A.percentile(0.2*100)"
stmt2 = "s2A.fractile(0.2)"
compare(stmt1, stmt2, 1000, 1, create_stairs)
Speed up ~ 4.9x¶
[14]:
# ``std()`` on a single Stairs object: v1 (s1A) against v2 (s2A).
stmt1, stmt2 = "s1A.std()", "s2A.std()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 3.6x¶
[15]:
# ``var()`` on a single Stairs object: v1 (s1A) against v2 (s2A).
stmt1, stmt2 = "s1A.var()", "s2A.var()"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 3.5x¶
[16]:
# ``cov()`` between two Stairs objects: v1 (s1A, s1B) against v2 (s2A, s2B).
stmt1, stmt2 = "s1A.cov(s1B)", "s2A.cov(s2B)"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 11x¶
[17]:
# ``corr()`` between two Stairs objects: v1 (s1A, s1B) against v2 (s2A, s2B).
stmt1, stmt2 = "s1A.corr(s1B)", "s2A.corr(s2B)"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 6.1x¶
Arithmetic
[18]:
# Addition of two Stairs objects with ``+``, in v1 and in v2.
stmt1, stmt2 = "s1A + s1B", "s2A + s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 7.7x¶
[19]:
# Subtraction of two Stairs objects with ``-``, in v1 and in v2.
stmt1, stmt2 = "s1A - s1B", "s2A - s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 7.8x¶
[20]:
# Multiplication of two Stairs objects with ``*``, in v1 and in v2.
stmt1, stmt2 = "s1A * s1B", "s2A * s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 14x¶
[21]:
# Division of two Stairs objects with ``/``.  The v1 statement first adds a
# small constant to the denominator, so it times that addition as well.
stmt1, stmt2 = "s1A / (s1B+0.0000001)", "s2A / s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 23x¶
Relational
[22]:
# ``<`` comparison of two Stairs objects, in v1 and in v2.
stmt1, stmt2 = "s1A < s1B", "s2A < s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 14x¶
[23]:
# ``<=`` comparison of two Stairs objects, in v1 and in v2.
stmt1, stmt2 = "s1A <= s1B", "s2A <= s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 14x¶
[24]:
# ``>`` comparison of two Stairs objects, in v1 and in v2.
stmt1, stmt2 = "s1A > s1B", "s2A > s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 14x¶
[25]:
# ``>=`` comparison of two Stairs objects, in v1 and in v2.
stmt1, stmt2 = "s1A >= s1B", "s2A >= s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 14x¶
[26]:
# ``==`` comparison of two Stairs objects, in v1 and in v2.
stmt1, stmt2 = "s1A == s1B", "s2A == s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 14x¶
[27]:
# ``!=`` comparison of two Stairs objects, in v1 and in v2.
stmt1, stmt2 = "s1A != s1B", "s2A != s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 14x¶
Logical
[28]:
# ``make_boolean()`` on a single Stairs object: v1 (s1A) against v2 (s2A).
# Uses 1000 repetitions, matching the other logical benchmarks.
stmt1 = "s1A.make_boolean()"
stmt2 = "s2A.make_boolean()"
compare(stmt1, stmt2, 1000, 1, create_stairs)
Speed up ~ 16x¶
[29]:
# The unary ``~`` operator applied to a Stairs object, in v1 and in v2.
stmt1, stmt2 = "~s1A", "~s2A"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 16x¶
[30]:
# The ``&`` operator applied to two Stairs objects, in v1 and in v2.
stmt1, stmt2 = "s1A & s1B", "s2A & s2B"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 16x¶
[31]:
# The ``|`` operator applied to two Stairs objects, in v1 and in v2.
# Uses 1000 repetitions, matching the other logical benchmarks.
stmt1 = "s1A | s1B"
stmt2 = "s2A | s2B"
compare(stmt1, stmt2, 1000, 1, create_stairs)
Speed up ~ 16x¶
Distribution
[32]:
# v1 ``ecdf_stairs()`` method call against v2 ``ecdf`` attribute access.
stmt1, stmt2 = "s1A.ecdf_stairs()", "s2A.ecdf"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 7.2x¶
[33]:
# v1 ``percentile_stairs()`` method call against v2 ``percentile`` attribute
# access.
stmt1, stmt2 = "s1A.percentile_stairs()", "s2A.percentile"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs)
Speed up ~ 5.1x¶
[34]:
# ``hist()`` over ten equal-width bins running from s2A.min() to just above
# s2A.max(): v1 receives the bin edges as an array, v2 an interval index.
hist_setup = """
low, high = s2A.min(), s2A.max()+0.0001
bins = pd.interval_range(low, high, 10, closed='left')
bin_edges=np.linspace(low, high, 11)
"""
stmt1 = "s1A.hist(bin_edges=bin_edges)"
stmt2 = "s2A.hist(bins=bins)"
compare(stmt1, stmt2, repeat=1000, number=1, extra_setup=create_stairs + hist_setup)
Speed up ~ 8.2x¶
Array methods
[35]:
# Extra ``timeit`` setup for the array benchmarks: generates ten random interval
# tables with numeric endpoints and builds, from the start/end/value columns of
# each, a list of v1 Stairs (s1array) and a list of v2 Stairs (s2array).
create_arrays = """
use_dates=False
dfs = [make_data(use_dates) for i in range(10)]
s1array = [sc1.Stairs(use_dates=use_dates).layer(df['start'], df['end'], df['value']) for df in dfs]
s2array = [sc2.Stairs().layer(df['start'], df['end'], df['value']) for df in dfs]
"""
[36]:
# Module-level ``mean`` over a list of Stairs objects: sc1 (v1) against sc2 (v2).
stmt1, stmt2 = "sc1.mean(s1array)", "sc2.mean(s2array)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays)
Speed up ~ 73x¶
[37]:
# Module-level ``median`` over a list of Stairs objects: sc1 (v1) against sc2 (v2).
stmt1, stmt2 = "sc1.median(s1array)", "sc2.median(s2array)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays)
Speed up ~ 65x¶
[38]:
# Module-level ``min`` over a list of Stairs objects: sc1 (v1) against sc2 (v2).
stmt1, stmt2 = "sc1.min(s1array)", "sc2.min(s2array)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays)
Speed up ~ 75x¶
[39]:
# Module-level ``max`` over a list of Stairs objects: sc1 (v1) against sc2 (v2).
stmt1, stmt2 = "sc1.max(s1array)", "sc2.max(s2array)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays)
Speed up ~ 75x¶
[40]:
# Module-level ``sample`` of a pandas Series of Stairs objects at the 366 evenly
# spaced points in ``x``: sc1 (v1) against sc2 (v2).
# NOTE(review): make_data generates endpoints in minutes (up to 24*365*60), so
# this grid stops at 24*365 minutes; confirm whether 24*365*60 was intended.
sample_setup = """
x = np.linspace(0,24*365,366)
s1array = pd.Series(s1array)
s2array = pd.Series(s2array)
"""
stmt1 = "sc1.sample(s1array, x)"
stmt2 = "sc2.sample(s2array, x)"
compare(stmt1, stmt2, repeat=100, number=1, extra_setup=create_arrays + sample_setup)