python - 矢量化查找另一行 + 计算字段
问题描述
我有这个数据框“dfSummary”-
exchangeBalances = [['ETHBTC','binance',10], ['LTCBTC','binance',10], ['XRPBTC','binance',10], ['ETHBTC','bitfinex',10], ['LTCBTC','bitfinex',10], ['XRPBTC','bitfinex',10]]
bidOffers = [
['ETHBTC','binance', 0.0035, 0.0351, datetime(2018, 9, 1, 8, 15)], ['LTCBTC','binance',0.009,0.092, datetime(2018, 9, 1, 8, 15)], ['XRPBTC','binance',0.000077, 0.000078, datetime(2018, 9, 1, 8, 15)], ['ETHBTC','bitfinex', 0.003522, 0.0353, datetime(2018, 9, 1, 8, 15)], ['LTCBTC','bitfinex',0.0093,0.095, datetime(2018, 9, 1, 8, 15)], ['XRPBTC','bitfinex',0.000083, 0.000085, datetime(2018, 9, 1, 8, 15)],
['ETHBTC','binance', 0.0035, 0.0351, datetime(2018, 9, 1, 8, 30)], ['LTCBTC','binance',0.009,0.092, datetime(2018, 9, 1, 8, 30)], ['XRPBTC','binance',0.000077, 0.000078, datetime(2018, 9, 1, 8, 30)], ['ETHBTC','bitfinex', 0.003522, 0.0353, datetime(2018, 9, 1, 8, 30)], ['LTCBTC','bitfinex',0.0093,0.095, datetime(2018, 9, 1, 8, 30)], ['XRPBTC','bitfinex',0.000083, 0.000085, datetime(2018, 9, 1, 8, 30)],
['ETHBTC','binance', 0.0035, 0.0351, datetime(2018, 9, 1, 8, 45)], ['LTCBTC','binance',0.009,0.092, datetime(2018, 9, 1, 8, 45)], ['XRPBTC','binance',0.000077, 0.000078, datetime(2018, 9, 1, 8, 45)], ['ETHBTC','bitfinex', 0.003522, 0.0353, datetime(2018, 9, 1, 8, 45)], ['LTCBTC','bitfinex',0.0093,0.095, datetime(2018, 9, 1, 8, 45)], ['XRPBTC','bitfinex',0.000083, 0.000085, datetime(2018, 9, 1, 8, 45)]
]
dfExchangeBalances = pd.DataFrame(exchangeBalances, columns=['symbol','exchange','balance'])
dfBidOffers = pd.DataFrame(bidOffers, columns=['symbol','exchange','bid', 'offer', 'created'])
dfBidOffers["spread"] = dfBidOffers["bid"] - dfBidOffers["offer"]
dfSummary = dfExchangeBalances.merge(dfBidOffers, how='left', on=['symbol','exchange'])
我需要完成的是,将计算字段添加到“dfSummary”:
currentRow["Spread"] - someOtherRow["Spread"]
“someOtherRow”是基于“created”的查找(例如,具有相同 {symbol, exchange} 但在 30 分钟前“创建”的最后一行(与“currentRow”相比)
澄清:上面的例子是手头实际问题的简化。间隔不完全是 15 分钟。事实上,我需要在 DataFrame 中查找相应的记录(相同的键 = {symbol,exchange}),但首先在第 1 个月、第 1 季度和第 1 年创建这样的记录。
我试图避免手动循环 DataFrame.iter 并改用内置查找的 Pandas(矢量化)
我在想数据框。查找 Pandas 数据框中值的向量化查找 但不确定如何从计算字段的上下文中使用它...?另外,我不想针对不同的DataFrame 进行查找,而是希望针对相同的DataFrame 进行查找。
矢量化(Pandas 和 Numpy - vs 循环):
解决方案
我明白了,这是我的真实代码(所以我没有发布所有内容)。这将起作用(但不确定这是否以最快的方式实施)。
我正在使用DataFrame.apply。这不是向 量化的方式,但应该比在 python 中循环要快得多。有人可以请阐明如何以完全矢量化的方式在下面重写吗?
参考这篇文章 - https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6
...我无法全神贯注地以矢量化方式重写,并且鉴于查找的性质,我开始觉得下面不能矢量化(很高兴你们中的一个人能证明我错了):
pdPnl = pd.DataFrame.from_records([ObjectUtil.objectPropertiesToDictionary(pnl) for pnl in profitLosses], columns=ObjectUtil.objectPropertiesToDictionary(profitLosses[0]).keys())
pdPnl["TM1"] = pdPnl.apply(lambda rw : rw["COB"] - timedelta(days=1) , axis=1)
pdPnl["MonthStart"] = pdPnl.apply(lambda rw : rw["COB"].replace(day=1), axis=1)
pdPnl["QuarterStart"] = pdPnl.apply(lambda rw : DateTimeUtil.getQuarterStart(rw["COB"], rw["COB"].year), axis=1)
pdPnl["YearStart"] = pdPnl.apply(lambda rw : datetime(rw["COB"].year, 1, 1), axis=1)
pdPnl["DTDRealizedPnl"] = pdPnl.apply(lambda rw : PnlCalculatorBase.computeField(pdPnl, rw["TM1"], rw["InceptionRealizedPnl"], "InceptionRealizedPnl"), axis=1)
pdPnl["DTDUnrealizedPnl"] = pdPnl.apply(lambda rw : PnlCalculatorBase.computeField(pdPnl, rw["TM1"], rw["InceptionUnrealizedPnl"], "InceptionUnrealizedPnl"), axis=1)
pdPnl["MTDRealizedPnl"] = pdPnl.apply(lambda rw : PnlCalculatorBase.computeField(pdPnl, rw["MonthStart"], rw["InceptionRealizedPnl"], "InceptionRealizedPnl"), axis=1)
pdPnl["MTDUnrealizedPnl"] = pdPnl.apply(lambda rw : PnlCalculatorBase.computeField(pdPnl, rw["MonthStart"], rw["InceptionUnrealizedPnl"], "InceptionUnrealizedPnl"), axis=1)
pdPnl["YTDRealizedPnl"] = pdPnl.apply(lambda rw : PnlCalculatorBase.computeField(pdPnl, rw["YearStart"], rw["InceptionRealizedPnl"], "InceptionRealizedPnl"), axis=1)
pdPnl["YTDUnrealizedPnl"] = pdPnl.apply(lambda rw : PnlCalculatorBase.computeField(pdPnl, rw["YearStart"], rw["InceptionUnrealizedPnl"], "InceptionUnrealizedPnl"), axis=1)
pdPnl["SharpeRatio"] = pdPnl.apply(lambda rw : PnlCalculatorBase.computeSharpeRatio(pdPnl, rw["COB"]), axis=1)
pdPnl["MaxDrawDown"] = pdPnl.apply(lambda rw : PnlCalculatorBase.computeMaxDrawDown(pdPnl, rw["COB"]), axis=1)
pnlDict = pdPnl.to_dict() # Then convert back to List of ProfitLoss (Slow...)
查找函数是:
@staticmethod
def lookUpRow(pdPnl, cob):
return pdPnl[pdPnl["COB"]==cob]
@staticmethod
def computeField(pdPnl, cob, todaysPnl, targetField):
val = np.nan
otherRow = PnlCalculatorBase.lookUpRow(pdPnl, cob)
if otherRow is not None and otherRow[targetField].shape[0]>0:
try:
tm1InceptionRealizedPnl = otherRow[targetField].iloc[0]
val = todaysPnl - tm1InceptionRealizedPnl
except:
# slow...
errMsg = "Failed lookup for " + str(cob) + " " + targetField
logging.error(errMsg)
val = np.nan
return val
@staticmethod
def computeSharpeRatio(pdPnl, cob):
val = None
pdPnl = pdPnl[(pdPnl['COB']<=cob)]
pdPnl = pdPnl.loc[:,["COB", "DTDRealizedPnl","DTDUnrealizedPnl"]]
pdPnl["TotalDTD"] = pdPnl.apply(lambda rw : rw["DTDRealizedPnl"] + rw["DTDUnrealizedPnl"], axis=1)
# @todo, We don't have risk free rate for Sharpe Ration calc. Here's just total DTD avg return over standard deviation
# https://en.wikipedia.org/wiki/Sharpe_ratio
mean = pdPnl["TotalDTD"].mean()
std = pdPnl["TotalDTD"].std()
val = mean / std
return val
@staticmethod
def computeMaxDrawDown(pdPnl, cob):
val = None
pdPnl = pdPnl[(pdPnl['COB']<=cob) & (pdPnl["DTDRealizedPnl"]<0)]
val = pdPnl["DTDRealizedPnl"].min()
return val
推荐阅读
- html - 用 Dart 解析 Html 页面
- asp.net-core - 如何处理多个授权服务器以防止令牌颁发失败
- esp32 - 无法编译 ESP-IDF 示例项目
- javascript - 如何使用 JSON 数据将数据库中的选定文本设置为填充的级联下拉列表
- angular - 以角度将多个值从服务返回到组件
- node.js - 如何为自定义类验证器验证装饰器强制执行某些执行顺序
- node.js - 有没有什么办法可以不使用任何第三方图片优化导出Next JS APP?
- java - 从自定义 gradle 插件中读取 Spring Properties 文件
- async-await - 等待来自 job.await() 的数据在 Composable 内部使用
- graphql - 使用 gaphql faker 伪造 graphql api 似乎没有按 id 查询字段