Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 87 additions & 48 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,90 +1,129 @@
# Created by https://www.gitignore.io/api/intellij,scala,sbt

### Intellij ###
# IntelliJ IDEA
.idea

# Spark tmp dir
tmp

#Docker Data
docker/data

### macOS template
# General
*.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/workspace.xml
.idea/tasks.xml
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries

# Sensitive or high-churn files:
.idea/dataSources/
.idea/dataSources.ids
.idea/dataSources.xml
.idea/dataSources.local.xml
.idea/sqlDataSources.xml
.idea/dynamic.xml
.idea/uiDesigner.xml
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/gradle.xml
.idea/libraries
.idea/**/gradle.xml
.idea/**/libraries

# CMake
cmake-build-debug/

# Mongo Explorer plugin:
.idea/mongoSettings.xml
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
### Java template
# Compiled class file
*.class

### Intellij Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# Log file
*.log

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# BlueJ files
*.ctxt

# Mobile Tools for Java (J2ME)
.mtj.tmp/

### Scala ###
*.class
*.log
# Package Files #
*.jar
*.war
*.ear
*.zip
*.tar.gz
*.rar

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### Scala template
### SBT template
# Simple Build Tool
# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control

# sbt specific
.cache
.history
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/
project/target/*
project/project/target/

# Scala-IDE specific
.scala_dependencies
.worksheet

# ENSIME specific
.ensime_cache/
.ensime


### SBT ###
# Simple Build Tool
# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control

.history
.cache
.lib/
sbt-cache
repositories

# End of https://www.gitignore.io/api/intellij,scala,sbt
.idea/

tmp/*
# lab configuration file
.lab.cache
13 changes: 2 additions & 11 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,11 @@ stages:
- test
- assembly

test-1.6:
stage: test
script:
- sbt -Dfile.encoding=UTF-8 "project core" 'set sparkVersion:="1.6.0"' 'testOnly * -- -n it.agilelab.bigdata.DataQuality.Spark1xTest'

test-2.4:
stage: test
script:
- sbt -Dfile.encoding=UTF-8 "project core" 'set sparkVersion:="2.4.0"' 'testOnly * -- -n it.agilelab.bigdata.DataQuality.Spark2xTest'

assembly-1.6:
stage: assembly
script:
- sbt "project core" 'set sparkVersion:="1.6.0"' assembly
- ./scripts/test-only-spark2.sh
coverage: '/Statement coverage.: (.*\%)/'

assembly-2.4:
stage: assembly
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ You can improve it by sending pull requests to this repository.
## Installation

Data Quality is currently using following dependencies:
- Scala 2.10(core)/2.11(ui)
- Apache Spark 1.6
- Scala 2.11.12
- Apache Spark 2.4.0
- PostgreSQL 9.3 (works also with Oracle and SQLite)

To be able to use all the features of Data Quality you'll need to setup a database
Expand All @@ -78,12 +78,12 @@ All modules of DQ works independently.:

## Building CORE module

Data Quality core module can be built with 2 different versions of Spark (1.6.0, 2.2.0). By default it will select 1.6.0. In order to build with Spark 2, set Multiversion.sparkVersion as in the following snippet:
From the sbt console opened in the root project issue the following commands:
```
- set Multiversion.sparkVersion := "2.2.0"
- project core
- assembly
```
This should generate the core artifact in `<your-project-dir>/dq-core/target/scala-<scala-version>/dq-core_<spark-major.minor>_<scala-major.minor.patch>-<dq-version>.jar`

## Examples

Expand Down
5 changes: 2 additions & 3 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import com.typesafe.sbt.SbtNativePackager.autoImport.NativePackagerHelper._
import sbt.Keys.scalaVersion

ThisBuild / organization := "it.agilelab"
ThisBuild / version := "1.3.0-SNAPSHOT"
ThisBuild / version := "1.3.2"

scalacOptions ++= Seq(
"-target:jvm-1.8",
Expand Down Expand Up @@ -80,7 +80,6 @@ lazy val core = (project in file("dq-core"))
}
((resourceDirectory in Compile).value / confFile) -> "conf/application.conf"
},

Universal / mappings ++= {
val integrationFolder = integrationEnv.value match {
case _ => "integration/dev"
Expand Down Expand Up @@ -134,4 +133,4 @@ lazy val be = (project in file("dq-be"))
libraryDependencies ++= {
Seq(jdbc, cache, ws, specs2 % Test, evolutions, guice) ++ Dependencies.dq_be
}
).dependsOn(api,common)
).dependsOn(api,common)
2 changes: 2 additions & 0 deletions docs/examples/data/usgs/output/CHECKS-1.6.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
checkId,checkName,description,checkedFile,baseMetric,comparedMetric,comparedThreshold,status,message,execDate
depth_avg_check,GREATER_THAN,Checks is average of depth is greather than 10,USGS_2000,depth_avg,,50.0,Success,Check depth_avg_check for metric AVG_NUMBER on column USGS_2000[Buffer(Depth)] check if (MetricResult) 85.05389330922242 is GREATER_THAN 50.0 (compareMetric/threshold). Result: Success. CheckStatus: 85.05389330922242 > 50.0.,2019-01-01
2 changes: 2 additions & 0 deletions docs/examples/data/usgs/output/CHECKS-2.4.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
checkId,checkName,description,checkedFile,baseMetric,comparedMetric,comparedThreshold,status,message,execDate
depth_avg_check,GREATER_THAN,Checks is average of depth is greather than 10,USGS_2000,depth_avg,"",50.0,Success,Check depth_avg_check for metric AVG_NUMBER on column USGS_2000[Buffer(Depth)] check if (MetricResult) 85.05389330922242 is GREATER_THAN 50.0 (compareMetric/threshold). Result: Success. CheckStatus: 85.05389330922242 > 50.0.,2019-01-01
4 changes: 3 additions & 1 deletion docs/installation/ui-setup.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#### DataQuality - UI
The UI to create configuration for Data Quality framework

## Quick Start

##Quick Start

### Prerequisites

DataQuality UI has some requirements:
Expand Down
22 changes: 22 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,16 +137,38 @@ Returns sum of all numerical values in the column

Parameters: none

##### "SUM_DECIMAL_NUMBER"
Returns sum of all numerical values in the column

In financial use cases, this metric is recommended over SUM_NUMBER.

Parameters: none

##### "AVG_NUMBER"
Returns the average of all numerical values in the column

Parameters: none

##### "AVG_DECIMAL_NUMBER"
Returns the average of all numerical values in the column

In financial use cases, this metric is recommended over AVG_NUMBER.

Parameters: none

##### "STD_NUMBER"
Returns the standard deviation of all numerical values in the column

Parameters: none

##### "STD_DECIMAL_NUMBER"

Returns the standard deviation of all numerical values in the column

In financial use cases, this metric is recommended over STD_NUMBER.

Parameters: none

##### "MIN_STRING"
Return string minimum of the column (mostly for comparing dates)

Expand Down
65 changes: 65 additions & 0 deletions dq-core/src/main/resources/conf/gitlab-ci-v1.0.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
data_quality {

application_name: "local"
run_configuration_version: 1.0
hive_warehouse_path: ""
hbase_host: ""

tmp_files_management: {
local_fs_path: "/tmp/fs"
hdfs_path: "/tmp/hdfs"
}

metric_error_management: {
dump_directory_path: "/tmp/dump"
dump_size: 1000 // max number of collected errors for 1 metric for 1 partition
empty_file: true
file_config: {
format: "csv"
delimiter: ","
quote: "\""
escape: "\\"
quote_mode: "ALL"
}
}

virtual_sources_management: {
dump_directory_path: "/tmp/virtual"
file_format: "csv"
delimiter: ","
}

// Result storage configuration
// Supported types: "DB", "NONE"
// Use "" to turn off storage feature
// "DB" subtypes: "SQLITE", "POSTGRES", "ORACLE"
storage:{
type: "NONE"
config: {
subtype: "POSTGRES"
host: "localhost:5433/dataquality"
user: "postgres"
password: "postgres"
schema: "dev"
}
}

// Check failure alert mailer configuration
mailing {
// "external" - to use external SMTP server
// "internal" - to use internal SMTP through bash script (check universal/bin/sendMail.sh for extra configuration)
// "" - to turn off mailing
mode: "internal"
mail_script_path: ""
// config: {
// address: "[email protected]"
// hostname: "smtp.gmail.com"
// username: "test.testovic"
// password: "password123"
// smtpPort: 465
// sslOnConnect: true
// }

notifications: false
}
}
Loading