# Upload test data
scp path root@ip:path
hdfs dfs -mkdir /user/hadoop/input
hdfs dfs -put /apache_logs.log /user/hadoop/input
hadoop fs -ls /user/hadoop/input

# Initialize the metastore schema in the standalone database
su hadoop
schematool -initSchema -dbType mysql

# Create the Hive database
hive
create database log_data_warehouse;
use log_data_warehouse;

# Create the Hive table
CREATE TABLE apache_logs(
  ipaddr STRING,
  identity STRING,
  username STRING,
  accesstime STRING,
  request STRING,
  status STRING,
  size STRING,
  referer STRING,
  agent STRING)
PARTITIONED BY (year STRING, month STRING, day STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (\\[.*\\]) (\".*?\") (-|[0-9]*) (-|[0-9]*) (\".*?\") (\".*?\")",
  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;
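For reference, the regex above targets the Apache combined log format. Note that the contrib SerDe class used here ships in the hive-contrib JAR, which must be on the classpath (Hive's built-in org.apache.hadoop.hive.serde2.RegexSerDe needs only input.regex). As an illustration, here is the classic sample line from the Apache documentation (not taken from apache_logs.log) with the column that each capture group feeds:

127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"

ipaddr=127.0.0.1, identity=-, username=frank, accesstime=[10/Oct/2000:13:55:36 -0700], request="GET /apache_pb.gif HTTP/1.0", status=200, size=2326, referer="http://www.example.com/start.html", agent="Mozilla/4.08 [en] (Win98; I ;Nav)"

The accesstime value keeps its brackets and the request/referer/agent values keep their quotes, because the capture groups include those delimiters.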
# Add a partition
alter table apache_logs add partition (year='2015', month='05', day='17');

# Load the data into this partition
load data inpath '/user/hadoop/input/*' into table log_data_warehouse.apache_logs partition (year='2015', month='05', day='17');

# Inspect the data
select * from log_data_warehouse.apache_logs where year='2015' and month='05' and day='17';

# Inspect the files
hadoop fs -ls -R /user/hive/warehouse/log_data_warehouse.db/apache_logs/year=2015/month=05/day=17

# Contents of migrator_config_validate.yml
!!com.aliyun.dlf.migrator.app.config.MigratorConfig
clientInfo:
  accessKeyId: LTAI5t*******TpCptuUhAyc
  accessKeySecret: UXCxc1I*******ITOd7Jo0HglR1H
  endPoint: dlf-vpc.cn-shanghai.aliyuncs.com
  regionId: cn-shanghai
  catalogId: 1564648900518939
mysql:
  connectionUri: jdbc:mysql://rm-uf62*******bbdvbkl7.mysql.rds.aliyuncs.com/hive_meta
  driver: com.mysql.cj.jdbc.Driver
  userName: xianwang2019
  password: Te******4
runOptions:
  batchSize: 200
  debug: false
  deleteBeforeCreate: false
  lowerCaseTableNames: true
  schema: hive_meta
  records: oss://testbp-106/validate
  objectTypes:
    - database
    - function
    - table
    - partition
  operations:
    - validate
  locationMappings:
    - source: hdfs://user/hive/warehouse/
      target: oss://testbp-106/

# Upload it to HDFS
hdfs dfs -put /migrator_config_validate.yml /user/hadoop/

# Run the validation tool
spark-submit \
--master yarn \
--deploy-mode client \
--driver-memory 2G \
--executor-memory 2G \
--executor-cores 4 \
--num-executors 3 \
--conf spark.sql.shuffle.partitions=200 \
--conf spark.kryoserializer.buffer.max=128m \
--conf spark.sql.autoBroadcastJoinThreshold=-1 \
--conf spark.sql.adaptive.enabled=false \
--conf spark.sql.adaptive.join.enabled=false \
--conf spark.executor.heartbeatInterval=600s \
--conf spark.network.timeout=3600s \
--class com.aliyun.dlf.migrator.app.MigratorApplication \
/dlf-migrator-tools-1.0.0-RC1.jar \
/user/hadoop/migrator_config_validate.yml

# Create the validation result tables
create table validate_database_result (leftResult string, rightResult string, status boolean, errorMessage string, fixResult string) stored as parquet location 'oss://testbp-106/validate/Databases';
create table validate_table_result (leftResult string, rightResult string, status boolean, errorMessage string, fixResult string) stored as parquet location 'oss://testbp-106/validate/Tables';
create table validate_function_result (leftResult string, rightResult string, status boolean, errorMessage string, fixResult string) stored as parquet location 'oss://testbp-106/validate/Functions';
create table validate_partition_result (leftResult string, rightResult string, status boolean, errorMessage string, fixResult string) stored as parquet location 'oss://testbp-106/validate/Partitions';

# Check whether the result tables are consistent (each count should be 0)
select count(*) from validate_database_result where status = false;
select count(*) from validate_table_result where status = false;
select count(*) from validate_function_result where status = false;
select count(*) from validate_partition_result where status = false;

# If any records are inconsistent, the errorMessage column describes the specific discrepancy
select * from validate_database_result where status = false limit 10;
select * from validate_table_result where status = false limit 10;
select * from validate_function_result where status = false limit 10;
select * from validate_partition_result where status = false limit 10;

# hive-site.xml changes
hive.imetastoreclient.factory.class=com.aliyun.datalake.metastore.hive2.DlfMetaStoreClientFactory
dlf.catalog.endpoint=dlf-vpc.cn-shanghai.aliyuncs.com
dlf.catalog.uid=156******8939
dlf.catalog.region=cn-shanghai
dlf.catalog.akMode=EMR_AUTO
dlf.catalog.sts.isNewMode=true
dlf.catalog.proxyMode=DLF_ONLY

# Migrating Hive Metastore to DLF involves the following dual-write (proxy) modes; a staged-cutover sketch follows the verification query below
METASTORE_ONLY,         // read/write Metastore only
METASTORE_DLF_FAILURE,  // read/write Metastore first; writes to DLF are allowed to fail
METASTORE_DLF_SUCCESS,  // read/write Metastore first; writes to DLF must succeed
DLF_METASTORE_SUCCESS,  // read/write DLF first; writes to Metastore must succeed
DLF_METASTORE_FAILURE,  // read/write DLF first; writes to Metastore are allowed to fail
DLF_ONLY,               // read/write DLF only

# Verify with a data query
use log_data_warehouse;
select * from log_data_warehouse.apache_logs where year='2015' and month='05' and day='17';
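The dual-write modes above allow a gradual cutover instead of a hard switch. Below is a minimal sketch of stepping dlf.catalog.proxyMode through a staged migration in hive-site.xml, assuming the mode semantics in the comments above; the ordering is a suggested progression, not an official procedure:

# Stage 1: Metastore remains the source of truth; DLF writes are best-effort
dlf.catalog.proxyMode=METASTORE_DLF_FAILURE
# Stage 2: DLF becomes primary while the old Metastore is still kept in sync
dlf.catalog.proxyMode=DLF_METASTORE_SUCCESS
# Stage 3: once the validation counts above are all 0, cut over to DLF only
dlf.catalog.proxyMode=DLF_ONLY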