datatable-让Python数据分析更快 (101个高频操作)
之前分享过一篇 谁是Python/R/Julia数据处理工具库中的最强武器? ,结果显示呼声很高的pandas性能并没那么优秀
本文主角
:Python的
datatable
,在一定程度上不乏为pandas有力竞争者,其模仿R中data.table的核心算法和接口,
致力于更快的、处理size更大的数据。
这里分享
datatable的101个常用操作
,助快速上手datatable。
0、安装
pip install datatable
1、加载datatable、查看版本号
import datatable as dt
dt.__version__
2、三种方式创建datatable.Frame
import pandas as pd
import numpy as np
import datatable as dt
# Inputs
my_list = list('abcedfghijklmnopqrstuvwxyz')
my_arr = np.arange(26)
my_df = pd.DataFrame(dict(col1=my_list, col2=my_arr))
# Solution
dt_df1 = dt.Frame(my_list) #list创建
dt_df2 = dt.Frame(my_arr) #numpy.ndarray
dt_df3 = dt.Frame(my_df) #pandas.DataFrame
dt_df4 = dt.Frame(A=my_arr, B=my_list)
3、读取csv文件为datatable.Frame
import datatable as dt
df = dt.fread(
'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
df.head(5)
左下方会默认显示行列数,这是pandas不具有的~
4、 读取csv文件前5行
import datatable as dt
df = dt.fread(
'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
max_nrows=5)
5、为datatable.Frame新增一列
# Input
import datatable as dt
df = dt.fread(
'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
max_nrows=5)
# Solution
df[:, "new_column"] = dt.Frame([1, 2, 3, 4, 5]) #新增一列new_column
6、取已有列创建新列
# Input
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
# Solution
df[:, "new_column"] = df[:, dt.f.age + dt.f.rad]
df.head()
7、取已有列整数部分创建新列
# Input
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
# Solution
df[:, "new_column"] = df[:, dt.int32(dt.f.dis)]
df.head(5)
8、按条件创建新列
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
#age列,年龄大于60赋值old,反之为new
df[:, "new_column"] = dt.Frame(np.where(df[:, dt.f.age > 60], 'Old', 'New'))
df.head(5)
9、left join两个datatable.Frame
import datatable as dt
df1 = dt.Frame(A=[1, 2, 3, 4], B=["a", "b", "c", "d"])
df2 = dt.Frame(A=[1, 2, 3, 4, 5], C=["a2", "b2", "c2", "d2", "e2"])
df2.key = "A"
output = df1[:, :, dt.join(df2)]
print(df1)
print(df2)
output
10、修改列名称
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
df.names = {'zn': 'zn_new'}
df.head(5)
11、每隔1行读取csv文件
import datatable as dt
import csv
with open('datasets-master/BostonHousing.csv', 'r') as f:
reader = csv.reader(f)
for i, row in enumerate(reader):
row = [[x] for x in row]
# 1st row
if i == 0:
df = dt.Frame(row)
header = [x[0] for x in df[0, :].to_list()]
df.names = header
del df[0, :]
# Every 2th row
elif i % 2 == 0:
df_temp = dt.Frame(row)
df_temp.names = header
df.rbind(df_temp)
df.head(5)
原始数据
12、读入csv文件按条件修改列
import datatable as dt
import csv
with open('datasets-master/BostonHousing.csv', 'r') as f:
reader = csv.reader(f)
for i, row in enumerate(reader):
row = [[x] for x in row]
if i == 0:
df = dt.Frame(row)
header = [x[0] for x in df[0, :].to_list()]
df.names = header
del df[0, :]
else:
row[13] = ['High'] if float(row[13][0]) > 25 else ['Low'] # 最后一列大于25赋值High,否则赋值为Low
df_temp = dt.Frame(row)
df_temp.names = header
df.rbind(df_temp)
df.head(5)
13、修改特定位置的值
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
df[2, 1] = 5
df.head(5)
14、datatable.Frame增删操作
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
#删除特定位置值
del df[2, 1]
del df[3, :]
# 删除列
del df[:, "chas"]
# 按条件删除
del df[dt.f.zn == 0, :]
df.head(5)
15、datatable.Frame与列表/字典/DataFrame等格式互转
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
# to pandas df
pd_df = df.to_pandas()
# to numpy arrays
np_arrays = df.to_numpy()
# to dictionary
dic = df.to_dict()
# to list
list_ = df[:,"indus"].to_list()
# to tuple
tuples_ = df[:,"indus"].to_tuples()
# to csv
df.to_csv("BostonHousing.csv")
16、获取datatable.Frame各列数据类型
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
for i in range(len(df.names)):
print(df.names[i], ":", df.stypes[i])
17、datatable.Frame基础统计计算
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
df.sum()
df.max()
df.min()
df.mean()
df.sd()
df.mode()
df.nmodal()
df.nunique()
18、datatable.Frame中使用group by
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
df[:, dt.mean(dt.f.Price), dt.by("Manufacturer")].head(5)
19、 datatable.Frame按升序sort
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# 方法1
df.sort("Price")
# 方法2
df[:,:, dt.sort(dt.f.Price)].head(5)
20、 datatable.Frame按降序sort
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# Solution
df[::-1, :, dt.sort(dt.f.Price)].head()
21、数据重复/追加5次
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# Solution
dt.repeat(df, 5)
22、字符串替换
import datatable as dt
df = dt.fread('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# Solution
df.replace("Audi", "My Dream Car")
df.head(5)
23、按条件提取值
# Input
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# Solution
# Get the highest price
print("Highest Price : ", df[:, dt.f.Price].max()[0, 0])
# Get Manufacturer with highest price
df[dt.f.Price == df[:, dt.f.Price].max()[0, 0],
['Manufacturer', 'Model', 'Type']]
24、修改列名
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# Solution
old_col_name = "Model"
new_col_name = "Car Model"
df.names = [new_col_name if x == old_col_name else x for x in df.names]
df.head(5)
25、统计每列NA值
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# Solution
df.countna()
26、取一列
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# Solution
df[:, "Model"].head(5)
27、列顺序颠倒
# Input
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# Solution 1
df.head()
df[:,::-1].head(5)
28、格式化输出
import datatable as dt
df = dt.Frame(random=np.random.random(4)**10)
df[:, "random2"] = dt.Frame(['%.6f' % x for x in df[:, "random"].to_list()[0]])
29、每隔20行按条件过滤
# Input
import datatable as dt
df = dt.fread('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# Solution
df[::20, ['Manufacturer', 'Model', 'Type']]
30、rows颠倒
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
print(df.head())
# Solution
df[::-1,:]
31、按指定值归一化数据
# Input
import datatable as dt
df = dt.fread("datasets-master/BostonHousing.csv")
# Solution
for i in df.names:
df[:,i] = df[:,(dt.f[i] - df[:,dt.min(dt.f[i])][0,0])/(df[:,dt.max(dt.f[i])][0,0] - df[:,dt.min(dt.f[i])][0,0])]
df.head(5)
32、group并取均值生成新列
# Input
import datatable as dt
df = dt.Frame(fruit=['apple', 'banana', 'orange'] * 3,
rating=np.random.rand(9),
price=np.random.randint(0, 15, 9))
df[:, dt.mean(dt.f.price), dt.by("fruit")]
33、按两列join
# Input
import datatable as dt
df1 = dt.Frame(A=[1, 2, 3, 4], B=["a", "b", "c", "d"], D=[1, 2, 3, 4])
df2 = dt.Frame(A=[1, 2, 4, 5],
B=["a", "b", "d", "e"],
C=["a2", "b2", "d2", "e2"])
# Solution
df2.key = ["A", "B"]
output = df1[:, :, dt.join(df2)]
output
34、ML小案例
import datatable as dt
from datatable.models import Ftrl
# Import data
train_df = dt.fread('pima_indian_diabetes_training_data.csv')
test_df = dt.fread('pima_indian_diabetes_testing_data.csv')
# Create Ftrl model
ftrl_model = Ftrl()
# add parameter values while creating model
ftrl_model = Ftrl(alpha=0.1, lambda1=0.5, lambda2=0.6)
# change paramter of existing model
ftrl_model.alpha = 0.1
ftrl_model.lambda1 = 0.5
ftrl_model.lambda2 = 0.6
# Prepare training and test dataset
train_df[:, "diabetes"] = dt.Frame(
np.where(train_df[:, dt.f["diabetes"] == "pos"], 1, 0))
test_df[:, "diabetes"] = dt.Frame(
np.where(test_df[:, dt.f["diabetes"] == "pos"], 1, 0))
x_train = train_df[:, [
"pregnant", "glucose", "pressure", "mass", "pedigree", "age"
y_train = train_df[:, ["diabetes"]]
x_test = test_df[:, [
"pregnant", "glucose", "pressure", "mass", "pedigree", "age"
y_test = test_df[:, ["diabetes"]]
# training the model
ftrl_model.fit(x_train, y_train)
# predictions of the model
targets = ftrl_model.predict(x_test)