遇到一个场景,要
根据oracle表结构创建hive表
。
本来想写一个根据oracle信息和表名自动在hive中创建表的脚本。如果在集群运行,python操作oracle的cx_Oracle需要在集群的每个节点都安装;时间紧急,干活要紧,先凑合写个脚本本地用着吧。
class GetHiveDDLFromOracleMeta:
    """Build a Hive CREATE EXTERNAL TABLE statement from Oracle column metadata.

    The Oracle metadata is one comma-separated string of
    "COLUMN_NAME DATA_TYPE" pairs, as produced by the LISTAGG query over
    user_tab_columns described in the accompanying notes.
    """

    def get_oralce_2_hive_datatype_map(self):
        """Return the Oracle-type -> Hive-type lookup table.

        Entries mapped to an empty string have no Hive equivalent and make
        the conversion fail loudly in transfrom_from_oracle_to_hive.
        (Method name keeps the original 'oralce' spelling so existing
        callers keep working.)
        """
        type_map = {"INTEGER": "double",
                    "NUMBER": "double",
                    "NUMBER(10)": "bigint",
                    "NUMBER(11)": "bigint",
                    "NUMBER(12)": "bigint",
                    "NUMBER(13)": "bigint",
                    "NUMBER(14)": "bigint",
                    "NUMBER(15)": "bigint",
                    "NUMBER(16)": "bigint",
                    "NUMBER(17)": "bigint",
                    "NUMBER(18)": "bigint",
                    "NUMBER(5)": "int",
                    "NUMBER(6)": "int",
                    "NUMBER(7)": "int",
                    "NUMBER(8)": "int",
                    "NUMBER(9)": "int",
                    "NUMBER(2)": "smallint",
                    "NUMBER(3)": "smallint",
                    "NUMBER(4)": "smallint",
                    "NUMBER(1)": "tinyint",
                    "FLOAT": "double",
                    "BINARY_FLOAT": "double",
                    "BINARY_DOUBLE": "double",
                    "DATE": "timestamp",
                    "TIMESTAMP(1)": "timestamp",
                    "TIMESTAMP(2)": "timestamp",
                    "TIMESTAMP(3)": "timestamp",
                    "TIMESTAMP(4)": "timestamp",
                    "TIMESTAMP(5)": "timestamp",
                    "TIMESTAMP(6)": "timestamp",
                    "CHAR": "string",
                    "NCHAR": "string",
                    "NVARCHAR2": "string",
                    "VARCHAR2": "string",
                    "NVARCHAR": "string",
                    "CLOB": "string",
                    # Types below have no direct Hive mapping; an empty value
                    # makes the conversion raise instead of silently emitting
                    # a broken column.
                    "BLOB": "", "BFILE": "", "NCLOB": "", "ROWID": "", "UROWID": "",
                    "TIMESTAMP with time zone": "",
                    "TIMESTAMP with local time zone": "", "ANYDATA": "", "VARRAY": "",
                    "NESTEDTAB": "", "Object": "", "REF": "", "RAW": ""}
        return type_map

    def split_oracl_meta(self, oracle_meta):
        """Split "NAME TYPE,NAME TYPE,..." into a list of (name, type) tuples.

        Commas inside parentheses -- e.g. NUMBER(10,2) -- are NOT split
        points, fixing the original naive str.split(",").

        :param oracle_meta: comma-separated "COLUMN_NAME DATA_TYPE" string
        :raises Exception: when an entry does not contain a name and a type
        """
        import re  # local import keeps the module's top level dependency-free
        name_types = []
        # Split on commas that are not followed by a ")" before the next "(",
        # i.e. commas that are outside any parenthesized precision/scale pair.
        for name_type in re.split(r",(?![^(]*\))", oracle_meta):
            parts = name_type.strip().split(None, 1)  # tolerate repeated spaces
            if len(parts) != 2:
                raise Exception(" ERROR!!! bad column definition: " + repr(name_type))
            name_types.append((parts[0], parts[1]))
        return name_types

    def transfrom_from_oracle_to_hive(self, oracle_meta_maps):
        """Map (name, oracle_type) pairs to (name, hive_type) pairs.

        NUMBER(p,s) is generalized to Hive decimal(p,s) (the original
        literal map key "NUMBER(P,S)" could never match a real type).
        Unknown or unmappable types raise with a descriptive message
        instead of a bare KeyError.
        """
        import re
        oracle_hive_maps = self.get_oralce_2_hive_datatype_map()
        hive_name_type_maps = []
        for column_name, oracle_type in oracle_meta_maps:
            hive_type = oracle_hive_maps.get(oracle_type, "")
            if not hive_type:
                # Fall back to the NUMBER(precision,scale) pattern.
                match = re.fullmatch(r"NUMBER\((\d+),(\d+)\)", oracle_type)
                if match:
                    hive_type = "decimal(%s,%s)" % (match.group(1), match.group(2))
            if not hive_type:
                raise Exception(" ERROR!!! no map column and type is "
                                + str((column_name, oracle_type)))
            hive_name_type_maps.append((column_name, hive_type))
        return hive_name_type_maps

    def get_dml_from_hive_name_type_maps(self, hive_name_type_maps, hive_table_name):
        """Assemble the CREATE EXTERNAL TABLE statement from (name, type) pairs."""
        columns = ", ".join(name + " " + htype for name, htype in hive_name_type_maps)
        # Two spaces after ")" preserved from the original output format.
        return ("create external table " + hive_table_name + " (" + columns
                + ")  row format delimited fields terminated by '\t' stored as textfile ;")

    def create_hive_table_dml_by_oracle_meta(self, oracle_meta, hive_table_name):
        """End-to-end: Oracle meta string + table name -> Hive DDL string."""
        oracle_name_types = self.split_oracl_meta(oracle_meta)
        hive_name_types = self.transfrom_from_oracle_to_hive(oracle_name_types)
        return self.get_dml_from_hive_name_type_maps(hive_name_types, hive_table_name)
if __name__ == "__main__":
this_obj = GetHiveDDLFromOracleMeta()
oracle_meta = "BIAOZHIPAIDIANNAOBIANHAO NUMBER,BIAOZHIPAIFUKAYOUXIAOQICONG TIMESTAMP(6),BIAOZHIPAIFUKAYOUXIAOQIZHI TIMESTAMP(6),BIAOZHIPAIHAOMA VARCHAR2,BIAOZHIPAILEIXING VARCHAR2,CHELIANGDIANNAOBIANHAO NUMBER,CHELIANGTUICHURIQI TIMESTAMP(6),CHEXINGXIANZHI VARCHAR2,CUSTOM_OPER_TYPE VARCHAR2,CUSTOM_UPDATE_TIME TIMESTAMP(6),DAIYONGKAYOUXIAOQICONG TIMESTAMP(6),DAIYONGKAYOUXIAOQIZHI TIMESTAMP(6),FUKAHAO VARCHAR2,GUANLIJIBIE VARCHAR2,JINGYINGQIXIAN TIMESTAMP(6),JINGYQUANDANWEIDIANNAOBIANHAO NUMBER,NITOURUCHELIANGRIQI TIMESTAMP(6),PIWENHAO VARCHAR2,PIZHUNJIGUAN VARCHAR2,PIZHUNSHIJIAN TIMESTAMP(6),PM_PRIMARYKEY NUMBER,QIDIANDI VARCHAR2,SHENQINGRIQI TIMESTAMP(6),SHIFOUYIFAPAI VARCHAR2,SHIYONGDANWEIDIANNAOBIANHAO NUMBER,SHOUCITOURUSHIJIAN TIMESTAMP(6),SHUJUZHUANGTAI VARCHAR2,SYS_XINZENGREN VARCHAR2,SYS_XINZENGSHIJIAN TIMESTAMP(6),SYS_ZUIJINXIUGAIREN VARCHAR2,SYS_ZUIJINXIUGAISHIJIAN TIMESTAMP(6),XIAQUSHENG VARCHAR2,XIAQUSHI VARCHAR2,XIAQUXIAN VARCHAR2,XIAQUZHEN VARCHAR2,XUKERIQI TIMESTAMP(6),YUANCHEPAIHAO VARCHAR2,ZHIBIAODIANNAOBIANHAO NUMBER,ZHIBIAOYOUXIAORIQI TIMESTAMP(6),ZHIBIAOZHONGLEI VARCHAR2,ZHUANGBEITIAOJIAN VARCHAR2,ZHUANGTAI VARCHAR2,ZUIDICHELIANGJISHUDENGJI VARCHAR2,ZUIDICHELIANGZHUANGBEIDENGJI VARCHAR2"
hive_table_name = "ST_DLYS_KEYUNBAOCHEXIANLU"
results = this_obj.create_hive_table_dml_by_oracle_meta(oracle_meta, hive_table_name)
print(type(results))
print(results)
注意:TABLE_NAME请改成创建hive表依据的oracle表名
select Listagg(concat(concat(A.COLUMN_NAME,' '),A.DATA_TYPE), ',') WITHIN GROUP(ORDER BY column_name)
from user_tab_columns A
where table_name = upper('TABLE_NAME');
查询结果类似这样
BIAOZHIPAIDIANNAOBIANHAO NUMBER,BIAOZHIPAIFUKAYOUXIAOQICONG TIMESTAMP(6),BIAOZHIPAIFUKAYOUXIAOQIZHI TIMESTAMP(6),BIAOZHIPAIHAOMA VARCHAR2,BIAOZHIPAILEIXING VARCHAR2,CHELIANGDIANNAOBIANHAO NUMBER,CHELIANGTUICHURIQI TIMESTAMP(6),CHEXINGXIANZHI VARCHAR2,CUSTOM_OPER_TYPE VARCHAR2,CUSTOM_UPDATE_TIME TIMESTAMP(6),DAIYONGKAYOUXIAOQICONG TIMESTAMP(6),DAIYONGKAYOUXIAOQIZHI TIMESTAMP(6),FUKAHAO VARCHAR2,GUANLIJIBIE VARCHAR2,JINGYINGQIXIAN TIMESTAMP(6),JINGYQUANDANWEIDIANNAOBIANHAO NUMBER,NITOURUCHELIANGRIQI TIMESTAMP(6),PIWENHAO VARCHAR2,PIZHUNJIGUAN VARCHAR2,PIZHUNSHIJIAN TIMESTAMP(6),PM_PRIMARYKEY NUMBER,QIDIANDI VARCHAR2,SHENQINGRIQI TIMESTAMP(6),SHIFOUYIFAPAI VARCHAR2,SHIYONGDANWEIDIANNAOBIANHAO NUMBER,SHOUCITOURUSHIJIAN TIMESTAMP(6),SHUJUZHUANGTAI VARCHAR2,SYS_XINZENGREN VARCHAR2,SYS_XINZENGSHIJIAN TIMESTAMP(6),SYS_ZUIJINXIUGAIREN VARCHAR2,SYS_ZUIJINXIUGAISHIJIAN TIMESTAMP(6),XIAQUSHENG VARCHAR2,XIAQUSHI VARCHAR2,XIAQUXIAN VARCHAR2,XIAQUZHEN VARCHAR2,XUKERIQI TIMESTAMP(6),YUANCHEPAIHAO VARCHAR2,ZHIBIAODIANNAOBIANHAO NUMBER,ZHIBIAOYOUXIAORIQI TIMESTAMP(6),ZHIBIAOZHONGLEI VARCHAR2,ZHUANGBEITIAOJIAN VARCHAR2,ZHUANGTAI VARCHAR2,ZUIDICHELIANGJISHUDENGJI VARCHAR2,ZUIDICHELIANGZHUANGBEIDENGJI VARCHAR2
复制查询结果
- 用上面复制的字段名和类型拼接成的字符串替换main函数里的参数oracle_meta=右侧的字符串
- 修改参数hive_table_name=右侧的表名为想要创建的hive表名
- (以py文件名是GetHiveDDLFromOracleMeta为例)在pycharm里run或者打开cmd输入$PYTHON_HOME/python GetHiveDDLFromOracleMeta.py
- pycharm控制台或者cmd输出的create语句即为所需的hive建表语句了
后面有时间会把复制sql查询结果这一步用查询oracle的方法替代。hive建表语句也不需要粘贴出来自己运行,而是直接自动创建。
一、问题遇到一个场景,要根据oracle表结构创建hive表。本来想写一个根据oracle信息和表名自动在hive中创建表的脚本。如果在集群运行,python操作oracle的cx_Oracle在集群的每个结点都需要安装,时间紧急,干活要紧,先凑合写个脚本本地用着吧二、解决# -*- coding=utf8 -*-class GetHiveDDLFromOracleMeta: def get_oralce_2_hive_datatype_map(self): type_m
本文主要给大家介绍了关于Mysql元数据生成Hive建表语句注释脚本的相关内容,分享出来供大家参考学习,下面话不多说了,来一起看看详细的介绍:
最近在将数据从Mysql 等其他关系型数据库 抽取到Hive 表中时,需要同步mysql表中的注释,以下脚本可以生成hive表字段注释修改语句。
注:其他关系型数据库如:oracle 可以通过相同的思路,读取元数据,修改脚本语法实现。
在mysql元数据库:information_schema 中执行以下语句
SELECT CONCAT('alter table ', TABLE_NAME, ' CHANGE COLUMN ', COL
在使用cx_Oracle连接正式Oracle的时候出现无法连接的问题,通过咨询DBA同事发现这台DB采用的是rac集群的方式搭建,因此连接的时候就应该使用另外的方式。
import cx_Oracle
tns = cx_Oracle.makedsn('0.0.0.0', '1521', service_name = 'racdb')
conn = cx_Oracle.connect('user',
首先,你需要安装对应的数据库驱动。对于 MySQL 和 MariaDB,你可以使用 PyMySQL 驱动;对于 PostgreSQL,你可以使用 psycopg2 驱动;对于 Oracle,你可以使用 cx_Oracle 驱动;对于 Kingbase,你可以使用 pyodbc 驱动;对于 Hive,你可以使用 PyHive 驱动。
具体的安装方法可以参考各自的文档,这里不再赘述。
然后,你可以使用...
4 import java.io.FileOutputStream;
5 import java.sql.Connection;
6 import java.sql.DriverManager;
7 import java.sql.ResultSet;
8 import java.sql.SQLException;
9 import java.sql.Statement;
10 import java...