Skip to content

Commit 0152352

Browse files
authored
Merge pull request #5 from emakhov/master
Refactors mailing and run summarization. Adds S3 support
2 parents 4271652 + 369a2d0 commit 0152352

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+2456
-1349
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Data Quality Framework
1+
# Agile Lab Data Quality
22

33
DQ is a framework to build parallel and distributed quality checks on big data environments.
44
It can be used to calculate metrics and perform checks to assure quality on structured or unstructured data.

build.sbt

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,25 @@
1-
import com.typesafe.sbt.packager.MappingsHelper.directory
21
import sbt.GlobFilter
32
import sbt.Keys.{logLevel, scalaVersion, test, updateOptions}
4-
import sbtassembly.AssemblyPlugin.autoImport.assemblyOption
5-
import src.main.scala.BuildEnvPlugin.autoImport.{BuildEnv, buildEnv}
6-
import src.main.scala.BuildIntegrationPlugin.autoImport.{IntegrationEnv, integrationEnv}
3+
import sbtassembly.AssemblyPlugin.autoImport.{assemblyExcludedJars, assemblyOption}
4+
import NativePackagerHelper._
75

86
name := "DataQuality-framework"
97

10-
lazy val commonSettings = Seq(version := "0.2.1")
8+
lazy val commonSettings = Seq(
9+
version := "1.1.0"
10+
)
1111

1212
scalacOptions ++= Seq(
1313
"-target:jvm-1.8",
1414
"-deprecation",
1515
"-feature",
1616
"-language:implicitConversions",
1717
"-language:postfixOps",
18-
"-language:reflectiveCalls",
19-
"-Xmax-classfile-name", "225"
20-
// "-Ypartial-unification"
18+
"-language:reflectiveCalls"
2119
)
2220

21+
scalacOptions ++= Seq("-Xmax-classfile-name", "225")
22+
2323
resolvers ++= Seq(
2424
Resolver.bintrayRepo("webjars","maven"),
2525
Resolver.sonatypeRepo("public"),
@@ -42,15 +42,13 @@ lazy val common = (project in file("dq-common"))
4242
lazy val core = (project in file("dq-core"))
4343
.enablePlugins(UniversalPlugin, UniversalDeployPlugin)
4444
.settings(
45-
// inThisBuild(
46-
// commonSettings ++ List(scalaVersion := "2.10.6")
47-
// ),
4845
scalaVersion := "2.10.6",
4946
commonSettings,
5047
libraryDependencies ++= Seq(
51-
"org.apache.spark" %% "spark-core" % "1.6.0",
52-
"org.apache.spark" %% "spark-sql" % "1.6.0",
53-
"org.apache.spark" %% "spark-hive" % "1.6.0",
48+
"org.apache.spark" %% "spark-core" % "1.6.0", //place % "provided" before deployment
49+
"org.apache.spark" %% "spark-sql" % "1.6.0", //place % "provided" before deployment
50+
"org.apache.spark" %% "spark-hive" % "1.6.0", //place % "provided" before deployment
51+
5452
"com.databricks" %% "spark-avro" % "2.0.1",
5553
"com.databricks" %% "spark-csv" % "1.5.0",
5654
"org.apache.commons" % "commons-lang3" % "3.0",
@@ -77,19 +75,38 @@ lazy val core = (project in file("dq-core"))
7775
assemblyExcludedJars in assembly := (fullClasspath in assembly).value.filter(_.data.getName startsWith "spark-assembly"),
7876
assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = true),
7977
test in assembly := {},
78+
assemblyMergeStrategy in assembly := {
79+
case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
80+
case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
81+
case PathList("org", "apache", xs @ _*) => MergeStrategy.last
82+
case PathList("com", "google", xs @ _*) => MergeStrategy.last
83+
case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
84+
case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
85+
case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
86+
case "about.html" => MergeStrategy.rename
87+
case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
88+
case "META-INF/mailcap" => MergeStrategy.last
89+
case "META-INF/mimetypes.default" => MergeStrategy.last
90+
case "plugin.properties" => MergeStrategy.last
91+
case "log4j.properties" => MergeStrategy.last
92+
case x =>
93+
val oldStrategy = (assemblyMergeStrategy in assembly).value
94+
oldStrategy(x)
95+
},
8096
mappings in Universal += {
8197
// TODO: Add paths to application configuration files
8298
val confFile = buildEnv.value match {
83-
case BuildEnv.Dev => "path to application.conf"
84-
case BuildEnv.Test => "path to application.conf"
85-
case BuildEnv.Production => "path to application.conf"
99+
case BuildEnv.Stage => "conf/qa.conf"
100+
case BuildEnv.Test => "conf/test.conf"
101+
case BuildEnv.Production => "conf/prod.conf"
102+
case BuildEnv.Dev => "conf/dev.conf"
86103
}
87104
((resourceDirectory in Compile).value / confFile) -> "conf/application.conf"
88105
},
89106
mappings in Universal ++= {
90107
// TODO: Add paths to application integration files
91108
val integrationFolder = integrationEnv.value match {
92-
case IntegrationEnv.local => "path to integration directory"
109+
case _ => "integration/dev"
93110
}
94111
directory((resourceDirectory in Compile).value / integrationFolder / "bin") ++
95112
directory((resourceDirectory in Compile).value / integrationFolder / "conf")
@@ -169,9 +186,9 @@ lazy val ui = (project in file("dq-ui"))
169186

170187
// use the combined tslint and eslint rules plus ng2 lint rules
171188
(rulesDirectories in tslint) := Some(List(
172-
tslintEslintRulesDir.value,
173-
// codelyzer uses 'cssauron' which can't resolve 'through' see https://github.com/chrisdickinson/cssauron/pull/10
174-
ng2LintRulesDir.value
189+
tslintEslintRulesDir.value,
190+
// codelyzer uses 'cssauron' which can't resolve 'through' see https://github.com/chrisdickinson/cssauron/pull/10
191+
ng2LintRulesDir.value
175192
)),
176193

177194
// the naming conventions of our test files

docs/examples/conf/full-prostprocess-example.conf

Lines changed: 98 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,28 @@ Sources: [
22
{
33
id = "GOT_B"
44
type = "HDFS"
5-
path = "./Agile.DataQuality/side-code/example-data/battles.csv"
5+
path = "./docs/examples/data/battles.csv"
66
delimiter = ","
77
header = true
88
fileType = "csv"
9-
keyFields = ["name","year","attacker_king","defender_king"]
9+
keyFields = ["name","year","defender_king"]
1010
},
1111
{
1212
id = "GOT_D"
1313
type = "HDFS"
14-
path = "./Agile.DataQuality/side-code/example-data/character-deaths.csv"
14+
path = "./docs/examples/data/character-deaths.csv"
1515
delimiter = ","
1616
header = true
1717
fileType = "csv"
18-
}
18+
},
19+
{
20+
id = "customer"
21+
type = "HDFS"
22+
path = "./docs/examples/data/customer.csv"
23+
delimiter = "|"
24+
header = false
25+
fileType = "csv"
26+
},
1927
]
2028

2129
VirtualSources: [
@@ -43,6 +51,39 @@ VirtualSources: [
4351
},
4452
]
4553

54+
LoadChecks: [
55+
{
56+
id = "customer_encoding_check"
57+
type = "ENCODING"
58+
source = "customer"
59+
option = "UTF-8"
60+
},
61+
{
62+
id = "customer_exact_column"
63+
type = "EXACT_COLUMN_NUM"
64+
source = "customer"
65+
option = 1
66+
},
67+
{
68+
id = "customer_min_column"
69+
type = "MIN_COLUMN_NUM"
70+
source = "customer"
71+
option = 2
72+
},
73+
{
74+
id = "customer_file_type"
75+
type = "FILE_TYPE"
76+
source = "customer"
77+
option = "avro"
78+
},
79+
{
80+
id = "customer_file_existence"
81+
type = "EXIST"
82+
source = "customer"
83+
option = true
84+
}
85+
]
86+
4687
Metrics: [
4788
{
4889
id: "row_count"
@@ -53,6 +94,36 @@ Metrics: [
5394
file: "GOT_B"
5495
}
5596
},
97+
{
98+
id: "customer_row_count"
99+
name: "ROW_COUNT"
100+
type: "FILE"
101+
description: "rowcount"
102+
config: {
103+
file: "customer"
104+
}
105+
},
106+
{
107+
id: "null_values"
108+
name: "NULL_VALUES"
109+
type: "COLUMN"
110+
description: "null values in column attacker_size"
111+
config: {
112+
file: "customer",
113+
columns: ["attacker_size"],
114+
positions: [1]
115+
}
116+
},
117+
{
118+
id: "null_values_col"
119+
name: "NULL_VALUES"
120+
type: "COLUMN"
121+
description: "null values in column attacker_size"
122+
config: {
123+
file: "customer",
124+
columns: ["C0"]
125+
}
126+
},
56127
{
57128
id: "average"
58129
name: "AVG_NUMBER"
@@ -232,41 +303,45 @@ Checks: [
232303

233304
Targets: [
234305
{
235-
type: "CHECKS"
306+
type: "FILE_METRICS"
236307
config: {
237308
fileFormat: "csv"
238-
path: "./Agile.DataQuality/side-code/dump"
309+
path: "./tmp/results"
239310
delimiter: ","
240-
savemode: "append"
241311
}
242312
},
243313
{
244-
type: "COLUMNAR-METRICS"
314+
type: "COLUMN_METRICS"
245315
config: {
246316
fileFormat: "csv"
247-
path: "./Agile.DataQuality/side-code/dump"
317+
path: "./tmp/results"
248318
delimiter: ","
249-
savemode: "append"
250319
}
251320
},
252321
{
253-
type: "FILE-METRICS"
322+
type: "COMPOSED_METRICS"
254323
config: {
255324
fileFormat: "csv"
256-
path: "./Agile.DataQuality/side-code/dump"
325+
path: "./tmp/results"
257326
delimiter: ","
258-
savemode: "append"
259327
}
260328
},
261329
{
262-
type: "COMPOSED-METRICS"
330+
type: "CHECKS"
263331
config: {
264332
fileFormat: "csv"
265-
path: "./Agile.DataQuality/side-code/dump"
333+
path: "./tmp/results"
266334
delimiter: ","
267-
savemode: "append"
268335
}
269336
},
337+
{
338+
type: "LOAD_CHECKS"
339+
config: {
340+
fileFormat: "csv"
341+
path: "./tmp/results"
342+
delimiter: ","
343+
}
344+
}
270345
]
271346

272347
Postprocessing: [
@@ -286,7 +361,7 @@ Postprocessing: [
286361
saveTo: {
287362
fileName: "tera_enriched"
288363
fileFormat: "csv"
289-
path: "./Agile.DataQuality/side-code/dump/postproc"
364+
path: "./tmp/postproc"
290365
delimiter: ","
291366
}
292367
}
@@ -299,7 +374,7 @@ Postprocessing: [
299374
saveTo: {
300375
fileName: "tera_transposed"
301376
fileFormat: "csv"
302-
path: "./Agile.DataQuality/side-code/dump/postproc"
377+
path: "./tmp/postproc"
303378
delimiter: ","
304379
quoted: true
305380
}
@@ -313,7 +388,7 @@ Postprocessing: [
313388
saveTo: {
314389
fileName: "tera_headless"
315390
fileFormat: "csv"
316-
path: "./Agile.DataQuality/side-code/dump/postproc"
391+
path: "./tmp/postproc"
317392
delimiter: ","
318393
}
319394
}
@@ -332,7 +407,7 @@ Postprocessing: [
332407
saveTo: {
333408
fileName: "tera_empty"
334409
fileFormat: "csv"
335-
path: "./Agile.DataQuality/side-code/dump/postproc"
410+
path: "./tmp/postproc"
336411
delimiter: ","
337412
}
338413
}
@@ -345,7 +420,7 @@ Postprocessing: [
345420
saveTo: {
346421
fileName: "empty_headless"
347422
fileFormat: "csv"
348-
path: "./Agile.DataQuality/side-code/dump/postproc"
423+
path: "./tmp/postproc"
349424
delimiter: ","
350425
}
351426
}
@@ -359,7 +434,7 @@ Postprocessing: [
359434
saveTo: {
360435
fileName: "empty_headless_keyed"
361436
fileFormat: "csv"
362-
path: "./Agile.DataQuality/side-code/dump/postproc"
437+
path: "./tmp/postproc"
363438
delimiter: ","
364439
}
365440
}
@@ -372,7 +447,7 @@ Postprocessing: [
372447
saveTo: {
373448
fileName: "tera_arranged"
374449
fileFormat: "csv"
375-
path: "./Agile.DataQuality/side-code/dump/postproc"
450+
path: "./tmp/postproc"
376451
delimiter: ","
377452
}
378453
}

docs/examples/data/customer.csv

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@ id|name
22
|
33
null|null
44
NULL|NULL
5-
nil|nil
5+
nil|nil|toast
66
1|pew
77
0|2
88
30|Paolo
9-
2|Rocco
9+
2|Rocco
10+
test
11+
1312

0 commit comments

Comments
 (0)