本文只讨论如何使用kudu提供的Python相关api,不涉及kudu自身环境的搭建和配置。
环境准备
注意:在安装kudu-python之前需要先确保已经配置好了kudu的C++ Client Libraries,并且不同的操作系统之间的依赖是需要分别配置的,这里只讨论Ubuntu和Centos。
C++ Client Libraries
详情请参考官网:kudu C++ Client Libraries
Ubuntu
# Install the toolchain and development libraries used to build Kudu from source.
sudo apt-get -y install autoconf automake curl flex g++ gcc gdb git \
krb5-admin-server krb5-kdc krb5-user libkrb5-dev libsasl2-dev libsasl2-modules \
libsasl2-modules-gssapi-mit libssl-dev libtool lsb-release make ntp \
openjdk-8-jdk openssl patch pkg-config python rsync unzip vim-common
# Fetch the Kudu source tree and enter it.
git clone https://github.com/apache/kudu
cd kudu
# Build the bundled third-party dependencies (judging by its name, the script
# skips work that is already done - confirm against the Kudu docs).
thirdparty/build-if-necessary.sh
# Configure a release build in a dedicated directory, using the cmake binary
# installed under thirdparty/ rather than a system cmake.
mkdir -p build/release
cd build/release
../../thirdparty/installed/common/bin/cmake \
-DCMAKE_BUILD_TYPE=release ../..
# Compile with 4 parallel jobs.
make -j4
Centos
# Install the toolchain and development libraries used to build Kudu from source.
sudo yum -y install autoconf automake cyrus-sasl-devel cyrus-sasl-gssapi \
cyrus-sasl-plain flex gcc gcc-c++ gdb git java-1.8.0-openjdk-devel \
krb5-server krb5-workstation libtool make openssl-devel patch pkgconfig \
redhat-lsb-core rsync unzip vim-common which
# Download the devtoolset-3 repository RPM (the URL targets epel-6 / x86_64),
# then install it together with the devtoolset-3 toolchain.
DTLS_RPM=rhscl-devtoolset-3-epel-6-x86_64-1-2.noarch.rpm
DTLS_RPM_URL=https://www.softwarecollections.org/repos/rhscl/devtoolset-3/epel-6-x86_64/noarch/${DTLS_RPM}
wget ${DTLS_RPM_URL} -O ${DTLS_RPM}
sudo yum install -y scl-utils ${DTLS_RPM}
sudo yum install -y devtoolset-3-toolchain
# Fetch the Kudu source tree and enter it.
git clone https://github.com/apache/kudu
cd kudu
# Build the bundled third-party dependencies; enable_devtoolset.sh is used as a
# wrapper here, presumably to run the command with devtoolset-3 active.
build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh
# Configure a release build in a dedicated directory, again through the
# devtoolset wrapper and with the cmake binary installed under thirdparty/.
mkdir -p build/release
cd build/release
../../build-support/enable_devtoolset.sh \
../../thirdparty/installed/common/bin/cmake \
-DCMAKE_BUILD_TYPE=release \
../..
# Compile with 4 parallel jobs.
make -j4
kudu-python
首次安装
在安装完kudu-python需要的相关依赖后,先不要急于安装kudu-python的包。以上提供的kudu目录实在是太大了,大约有60G,不可能每次都去重新安装或者拷贝一遍。我们先把kudu移动到方便的目录下,并且保证之后的每台机器的kudu依赖都安装于此,之后设置一下KUDU_HOME。随后就可以开始pip install kudu-python了。
安装命令运行完后记得将wheel文件的位置记录下来,其他机器上kudu-python的安装都要依靠这个文件。
复用安装
光有一个wheel文件是不够的,虽然能够成功地安装kudu-python,但在真正运行的时候会出现找不到so文件的异常。这种时候就需要在之前安装kudu的机器上将$KUDU_HOME/build/latest/lib/exported下的所有文件拷贝到需要安装的机器的同样目录下,再用pip install安装该wheel文件,之后即可正常运行。
需要注意的是,Ubuntu和Centos之间是不能互相使用wheel文件和so文件的,需要各自进行操作。但是它们的wheel文件名是一样的,使用docker安装的小伙伴在将其拷贝到宿主机时,不要把两者放在同一目录下。
使用
import kudu
from kudu.client import Partitioning
from kudu.schema import Schema
# Lookup table from the type names used in column definitions to the
# same-named type objects exposed on the kudu package.
type_mapper = {
    type_name: getattr(kudu, type_name)
    for type_name in (
        "int8",
        "int16",
        "int32",
        "int64",
        "float",
        "double",
        "decimal",
        "binary",
        "string",
    )
}
class KuduClient:
    """Singleton convenience wrapper around a Kudu client and one session.

    The first construction connects to the Kudu master and opens a session;
    every later ``KuduClient(...)`` call returns that same instance without
    reconnecting (arguments passed to later calls are ignored). Call
    :meth:`close` to release the client and allow a fresh instance.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Remember the first instance; without storing it the class would
        # hand out a brand-new object on every call.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, host="127.0.0.1", port=7051):
        """Connect to the Kudu master and open a session.

        :param host: Kudu master host name or address. The default is only a
            local-development placeholder; pass the real master address.
        :param port: Kudu master RPC port.
        """
        # __init__ runs on every KuduClient(...) call, even when __new__
        # returned the cached instance, so skip re-connecting in that case.
        if getattr(self, "client", None) is not None:
            return
        self.client = kudu.connect(host=host, port=port)
        # Author's note: the session has no close method, but a timeout can
        # be configured on it.
        self.session = self.client.new_session()

    @staticmethod
    def _range_bound_kwargs(bound: dict) -> dict:
        """Expand a bound dict into add/drop_range_partition keyword args.

        Missing or empty bound types fall back to an inclusive lower bound
        and an exclusive upper bound.
        """
        return {
            "lower_bound": bound.get("lower_bound"),
            "upper_bound": bound.get("upper_bound"),
            "lower_bound_type": bound.get("lower_bound_type") or "inclusive",
            "upper_bound_type": bound.get("upper_bound_type") or "exclusive",
        }

    @staticmethod
    def builder() -> "kudu.schema.SchemaBuilder":
        """Return a new, empty schema builder from ``kudu.schema_builder()``."""
        return kudu.schema_builder()

    @staticmethod
    def schema(builder: "kudu.schema.SchemaBuilder", columns: list) -> "kudu.schema.Schema":
        """Add ``columns`` to ``builder`` and build the schema.

        :param builder: schema builder, e.g. from :meth:`builder`.
        :param columns: list of column dicts. Each dict needs ``name`` and
            ``type`` (a key of ``type_mapper``); optional keys are
            ``nullable``, ``primary_key``, ``compression``, ``encoding``,
            ``default``, ``block_size``, ``precision`` and ``scale``.
            Example::

                [
                    {"name": "student_no", "type": "int32",
                     "nullable": False, "primary_key": True},
                    {"name": "age", "type": "int8",
                     "nullable": False, "primary_key": True},
                    {"name": "name", "type": "string", "nullable": True},
                    {"name": "gender", "type": "string", "nullable": True},
                ]
        :return: the result of ``builder.build()``.
        :raises ValueError: if a column's ``type`` is not in ``type_mapper``.
        """
        primary_keys = []
        for column in columns:
            name = column.get("name")
            type_name = column.get("type")
            # Fail early with a readable message instead of handing
            # type_=None to the builder.
            if type_name not in type_mapper:
                raise ValueError(
                    "unsupported Kudu column type {!r} for column {!r}".format(type_name, name)
                )
            if column.get("primary_key"):
                primary_keys.append(name)
            builder.add_column(
                name=name,
                type_=type_mapper[type_name],
                nullable=bool(column.get("nullable")),
                compression=column.get("compression"),
                encoding=column.get("encoding"),
                default=column.get("default"),
                block_size=column.get("block_size"),
                precision=column.get("precision"),
                scale=column.get("scale")
            )
        builder.set_primary_keys(primary_keys)
        return builder.build()

    @staticmethod
    def partition(hash_columns: list, range_columns: list = None, bound: dict = None,
                  bucket_num=3) -> "Partitioning":
        """Build a partitioning description for :meth:`create_table`.

        :param hash_columns: column names; one hash partition level with
            ``bucket_num`` buckets is added per entry.
        :param range_columns: range partition column names. ``None`` leaves
            the range partition columns unset.
        :param bound: optional dict with ``lower_bound`` / ``upper_bound``
            and optional ``lower_bound_type`` / ``upper_bound_type``. When
            omitted, no range partition is added.
        :param bucket_num: number of buckets for each hash partition level.
        :return: the configured ``Partitioning`` object.
        """
        partitioning = Partitioning()
        for column in hash_columns or ():
            partitioning.add_hash_partitions(column_names=column, num_buckets=bucket_num)
        if range_columns is not None:
            partitioning.set_range_partition_columns(range_columns)
        # bound defaults to None; only add a range partition when one is given.
        if bound:
            partitioning.add_range_partition(**KuduClient._range_bound_kwargs(bound))
        return partitioning

    def add_range_partition(self, table: "kudu.Table", bound: dict) -> None:
        """Add a range partition to an existing table.

        :param table: table object, e.g. from :meth:`table`.
        :param bound: dict such as::

                {
                    "lower_bound": {"create_time": "2020-01-01 00:00:00"},
                    "upper_bound": {"create_time": "2020-01-01 23:59:59"}
                }

            Optional ``lower_bound_type`` / ``upper_bound_type`` default to
            ``"inclusive"`` / ``"exclusive"``.
        """
        alterer = self.client.new_table_alterer(table)
        alterer.add_range_partition(**self._range_bound_kwargs(bound))
        alterer.alter()

    def drop_range_partition(self, table: "kudu.Table", bound: dict) -> None:
        """Drop the range partition described by ``bound`` from ``table``.

        ``bound`` has the same layout as in :meth:`add_range_partition`.
        """
        alterer = self.client.new_table_alterer(table)
        alterer.drop_range_partition(**self._range_bound_kwargs(bound))
        alterer.alter()

    def show_tables(self) -> list:
        """Return the result of the client's ``list_tables()``."""
        return self.client.list_tables()

    def create_table(self, table_name: str, schema: "kudu.schema.Schema",
                     partition: "Partitioning", replica=3) -> None:
        """Create ``table_name`` with the given schema, partitioning and replica count."""
        self.client.create_table(table_name, schema, partition, replica)

    def drop_table(self, table_name: str) -> None:
        """Delete the table named ``table_name``."""
        self.client.delete_table(table_name)

    def table(self, table_name: str) -> "kudu.Table":
        """Open ``table_name`` and return the table object."""
        return self.client.table(table_name)

    def insert(self, table: "kudu.Table", rows: list):
        """Insert ``rows`` into ``table`` through the shared session.

        :param table: table object, e.g. from :meth:`table`.
        :param rows: list of row dicts, e.g.
            ``[{"student_no": 11, "age": 12, "name": "amy"}]``.
        :return: ``None`` when the flush succeeds; otherwise whatever
            ``session.get_pending_errors()`` reports.
        """
        for row in rows:
            self.session.apply(table.new_insert(row))
        try:
            self.session.flush()
        except kudu.KuduBadStatus:
            return self.session.get_pending_errors()
        return None

    def close(self) -> None:
        """Close the underlying client and forget the cached singleton."""
        # getattr guards against instances whose __init__ never completed.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()
            self.client = None
            self.session = None
        cls = type(self)
        if cls._instance is self:
            cls._instance = None

    def __del__(self):
        self.close()
if __name__ == '__main__':
    # Demo: create a table that is hash-partitioned on the integer keys and
    # range-partitioned on create_time, then insert a single row.
    import datetime

    client = KuduClient()
    builder = client.builder()
    columns = [
        {
            "name": "student_no",
            "type": "int32",
            "nullable": False,
            "primary_key": True
        }, {
            "name": "age",
            "type": "int8",
            "nullable": False,
            "primary_key": True
        }, {
            "name": "create_time",
            "type": "string",
            "nullable": False,
            "primary_key": True
        }, {
            "name": "name",
            "type": "string",
            "nullable": True
        }, {
            "name": "gender",
            "type": "string",
            "nullable": True
        }
    ]
    # Take one timestamp so both bounds and the demo row refer to the same day
    # even if the script runs across midnight.
    today = datetime.datetime.now()
    # Author's note: the bound must be written as a dict; with a list, the
    # first declared primary key unexpectedly ends up in the range partition.
    bound = {
        "lower_bound": {"create_time": today.strftime("%Y-%m-%d 00:00:00")},
        "upper_bound": {"create_time": today.strftime("%Y-%m-%d 23:59:59")}
    }
    schema = client.schema(builder, columns)
    partition = client.partition(["student_no", "age"], ["create_time"], bound)
    print(partition.__dict__)
    table_name = "python_kudu_test"
    # Only drop the table when it exists, so the first run does not fail.
    if table_name in client.show_tables():
        client.drop_table(table_name)
    client.create_table(table_name, schema, partition)
    table = client.table(table_name)
    # client.add_range_partition(table, bound)
    # Author's note: the inserted data must fall inside the range partition;
    # otherwise it is not inserted and no error is raised. Noon of the same
    # day is always inside [00:00:00, 23:59:59).
    errors = client.insert(
        table,
        [{
            "student_no": 11,
            "age": 12,
            "name": "amy",
            "create_time": today.strftime("%Y-%m-%d 12:00:00")
        }]
    )
    if errors:
        print(errors)
    # print(client.show_tables())
总结
以上仅是个人在使用kudu-python时踩坑的一些心得,如果有没注意到的地方欢迎各位指正。
来源:CSDN
作者:dhiuwha
链接:https://blog.csdn.net/dhiuwha/article/details/103825140