A Hudi dataset is stored under a base path. The dataset is divided into partitions, which are folders containing the data files for that partition, much like a Hive table. Each partition is identified by its partition path, which is relative to the base path. Within each partition, files are organized into file groups, uniquely identified by file IDs. Each file group contains multiple file slices. Each file slice contains a base file (*.parquet), a columnar file generated at a certain commit or compaction instant time, and a set of log files (*.log.*) that contain upserts to the base file since it was generated. An illustrative directory layout is sketched after the trade-off table below.

The two table types, Copy on Write and Merge on Read, involve the following trade-offs:

| Trade-Off | Copy on Write | Merge on Read |
| --- | --- | --- |
| Data latency | Higher | Lower |
| Update cost (I/O) | Higher (rewrites the entire Parquet file) | Lower (appends to incremental logs) |
| Parquet file size | Smaller (high update (I/O) cost) | Larger (low update cost) |
| Write amplification | Higher | Lower (depending on the compaction strategy) |
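To make these terms concrete, a Merge on Read dataset under a base path might be laid out roughly as follows. The partition value, file ID, and instant time are made up, and the exact file-name pattern depends on the Hudi version, so treat this only as a sketch of the concepts above:

/usr/hive/warehouse/stock_ticks_mor/           # base path
  .hoodie/                                     # table metadata and the commit/compaction timeline
  2018/08/31/                                  # one partition; the partition path is relative to the base path
    fileId1_1-0-1_20180831100000.parquet       # base file of a file slice, written at instant 20180831100000
    .fileId1_20180831100000.log.1_1-0-1        # log file holding upserts arriving after that base file was written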
Log in as the hadoop user, switch to the Hudi installation directory, and link the Spark defaults into the demo configuration:
cd /usr/local/service/hudi
ln -s /usr/local/service/spark/conf/spark-defaults.conf /usr/local/service/hudi/demo/config/spark-defaults.conf
hdfs dfs -mkdir -p /hudi/config
hdfs dfs -copyFromLocal demo/config/* /hudi/config/
Edit /usr/local/service/hudi/demo/config/kafka-source.properties and set the Kafka broker address:
bootstrap.servers=kafka_ip:kafka_port
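For orientation, a minimal kafka-source.properties for this demo might look like the sketch below. Only bootstrap.servers comes from this guide; the remaining keys and values are assumptions based on HoodieDeltaStreamer's JsonKafkaSource and FilebasedSchemaProvider and on the stock_ticks topic and schema.avsc file used elsewhere in this walkthrough, so the file shipped under demo/config is authoritative:

# Kafka broker(s) that HoodieDeltaStreamer reads from
bootstrap.servers=kafka_ip:kafka_port
# Topic carrying the demo records (assumed; matches the kafkacat commands below)
hoodie.deltastreamer.source.kafka.topic=stock_ticks
# Record key and partition path fields of the stock_ticks records (assumed field names)
hoodie.datasource.write.recordkey.field=key
hoodie.datasource.write.partitionpath.field=date
# Avro schema used by FilebasedSchemaProvider (path assumed; matches the schema.avsc copied to /hudi/config)
hoodie.deltastreamer.schemaprovider.source.schema.file=/hudi/config/schema.avsc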
cat demo/data/batch_1.json | kafkacat -b [kafka_ip] -t stock_ticks -P
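Each line of batch_1.json is one JSON record. The exact field list is defined by the demo schema (schema.avsc); judging from the columns queried later (symbol, ts, volume, open, close), a record is shaped roughly like this hypothetical example:

{"symbol": "GOOG", "ts": "2018-08-31 10:29:00", "volume": 3391, "open": 1230.5, "close": 1230.02}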
spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer --master yarn ./hudi-utilities-bundle_2.11-0.5.1-incubating.jar --table-type COPY_ON_WRITE --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /usr/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /hudi/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer --master yarn ./hudi-utilities-bundle_2.11-0.5.1-incubating.jar --table-type MERGE_ON_READ --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /usr/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /hudi/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider --disable-compaction
hdfs dfs -ls /usr/hive/warehouse/
bin/run_sync_tool.sh --jdbc-url jdbc:hive2://[hiveserver2_ip:hiveserver2_port] --user hadoop --pass [password] --partitioned-by dt --base-path /usr/hive/warehouse/stock_ticks_cow --database default --table stock_ticks_cow
bin/run_sync_tool.sh --jdbc-url jdbc:hive2://[hiveserver2_ip:hiveserver2_port] --user hadoop --pass [password] --partitioned-by dt --base-path /usr/hive/warehouse/stock_ticks_mor --database default --table stock_ticks_mor --skip-ro-suffix
beeline -u jdbc:hive2://[hiveserver2_ip:hiveserver2_port] -n hadoop --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
spark-sql --master yarn --conf spark.sql.hive.convertMetastoreParquet=false
select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
/usr/local/service/presto-client/presto --server localhost:9000 --catalog hive --schema default --user Hadoop
Note that in Presto, the _hoodie_commit_time field must be enclosed in double quotation marks, i.e., "_hoodie_commit_time". Execute the following SQL statements:
select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
cat demo/data/batch_2.json | kafkacat -b 10.0.1.70 -t stock_ticks -P
spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer --master yarn ./hudi-utilities-bundle_2.11-0.5.1-incubating.jar --table-type COPY_ON_WRITE --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /usr/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /hudi/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer --master yarn ./hudi-utilities-bundle_2.11-0.5.1-incubating.jar --table-type MERGE_ON_READ --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /usr/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /hudi/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider --disable-compaction
cli/bin/hudi-cli.sh
connect --path /usr/hive/warehouse/stock_ticks_mor
compactions show all
compaction schedule
Run the scheduled compaction plan, passing the compaction instant reported by compactions show all as [requestID]:
compaction run --compactionInstant [requestID] --parallelism 2 --sparkMemory 1G --schemaFilePath /hudi/config/schema.avsc --retry 1
beeline -u jdbc:hive2://[hiveserver2_ip:hiveserver2_port] -n hadoop --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
set hive.execution.engine=tez;
set hive.execution.engine=spark;
To store data in COS, add cosn://[bucket] before the storage path. Example:
bin/kafka-server-start.sh config/server.properties &
cat demo/data/batch_1.json | kafkacat -b kafkaip -t stock_ticks -P
cat demo/data/batch_2.json | kafkacat -b kafkaip -t stock_ticks -P
kafkacat -b kafkaip -L
hdfs dfs -mkdir -p cosn://[bucket]/hudi/config
hdfs dfs -copyFromLocal demo/config/* cosn://[bucket]/hudi/config/
spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer --master yarn ./hudi-utilities-bundle_2.11-0.5.1-incubating.jar --table-type COPY_ON_WRITE --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path cosn://[bucket]/usr/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props cosn://[bucket]/hudi/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer --master yarn ./hudi-utilities-bundle_2.11-0.5.1-incubating.jar --table-type MERGE_ON_READ --source-class org.apache.hudi.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path cosn://[bucket]/usr/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props cosn://[bucket]/hudi/config/kafka-source.properties --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider --disable-compaction
bin/run_sync_tool.sh --jdbc-url jdbc:hive2://[hiveserver2_ip:hiveserver2_port] --user hadoop --pass isd@cloud --partitioned-by dt --base-path cosn://[bucket]/usr/hive/warehouse/stock_ticks_cow --database default --table stock_ticks_cow
bin/run_sync_tool.sh --jdbc-url jdbc:hive2://[hiveserver2_ip:hiveserver2_port] --user hadoop --pass hive --partitioned-by dt --base-path cosn://[bucket]/usr/hive/warehouse/stock_ticks_mor --database default --table stock_ticks_mor --skip-ro-suffix
beeline -u jdbc:hive2://[hiveserver2_ip:hiveserver2_port] -n hadoop --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
spark-sql --master yarn --conf spark.sql.hive.convertMetastoreParquet=false
Hive SQL statements:
select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
Presto SQL statements:
/usr/local/service/presto-client/presto --server localhost:9000 --catalog hive --schema default --user Hadoop
select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
select "_hoodie_commit_time", symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
cli/bin/hudi-cli.sh
connect --path cosn://[bucket]/usr/hive/warehouse/stock_ticks_mor
compactions show all
compaction schedule
compaction run --compactionInstant [requestid] --parallelism 2 --sparkMemory 1G --schemaFilePath cosn://[bucket]/hudi/config/schema.avsc --retry 1