Skip to content

Commit 4814ef5

Browse files
authored
Merge pull request #1501 from jamescowens/integrated_scraper_2
Correct team file processing.
2 parents 7144227 + f32a5dc commit 4814ef5

File tree

1 file changed

+60
-40
lines changed

1 file changed

+60
-40
lines changed

src/scraper/scraper.cpp

Lines changed: 60 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ NN::QuorumHash ScraperGetSuperblockHash(NN::Superblock& superblock);
131131

132132
bool DownloadProjectHostFiles(const NN::WhitelistSnapshot& projectWhitelist);
133133
bool DownloadProjectTeamFiles(const NN::WhitelistSnapshot& projectWhitelist);
134-
bool ProcessProjectTeamFile(const fs::path& file, const std::string& etag, std::map<std::string, int64_t>& mTeamIdsForProject_out);
134+
bool ProcessProjectTeamFile(const std::string& project, const fs::path& file, const std::string& etag);
135135
bool DownloadProjectRacFilesByCPID(const NN::WhitelistSnapshot& projectWhitelist);
136136
bool ProcessProjectRacFileByCPID(const std::string& project, const fs::path& file, const std::string& etag, BeaconConsensus& Consensus);
137137
bool AuthenticationETagUpdate(const std::string& project, const std::string& etag);
@@ -1478,18 +1478,24 @@ bool DownloadProjectTeamFiles(const NN::WhitelistSnapshot& projectWhitelist)
14781478

14791479
std::string team_file_name;
14801480
fs::path team_file;
1481+
std::map<std::string, int64_t> mTeamIDsForProject;
1482+
bool bDownloadFlag = false;
14811483

14821484
if (fExplorer)
14831485
{
14841486
// Use eTag versioning.
14851487
team_file_name = prjs.m_name + "-" + sTeamETag + "-team.gz";
14861488
team_file = pathScraper / team_file_name;
14871489

1488-
// If the file with the same eTag already exists, don't download it again.
1490+
// If the file with the same eTag already exists, don't download it again. Leave bDownloadFlag false.
14891491
if (fs::exists(team_file))
14901492
{
14911493
_log(logattribute::INFO, "DownloadProjectTeamFiles", "Etag file for " + prjs.m_name + " already exists");
1492-
continue;
1494+
// continue;
1495+
}
1496+
else
1497+
{
1498+
bDownloadFlag = true;
14931499
}
14941500
}
14951501
else
@@ -1499,33 +1505,32 @@ bool DownloadProjectTeamFiles(const NN::WhitelistSnapshot& projectWhitelist)
14991505
team_file = pathScraper / team_file_name;
15001506

15011507
if (fs::exists(team_file)) fs::remove(team_file);
1502-
}
15031508

1504-
try
1505-
{
1506-
http.Download(prjs.StatsUrl("team"), team_file.string(), userpass);
1509+
bDownloadFlag = true;
15071510
}
1508-
catch(const std::runtime_error& e)
1509-
{
1510-
_log(logattribute::ERR, "DownloadProjectTeamFiles", "Failed to download project team file for " + prjs.m_name);
1511-
continue;
1512-
}
1513-
1514-
// If in explorer mode, save team xml files to file manifest map with exclude from CSManifest flag set to true.
1515-
if (fExplorer) AlignScraperFileManifestEntries(team_file, "team", prjs.m_name, true);
1516-
1517-
std::map<std::string, int64_t> mTeamIDsForProject;
15181511

1519-
// We need this check because we could be in explorer mode (if fExplorer is true).
1520-
if (REQUIRE_TEAM_WHITELIST_MEMBERSHIP)
1512+
// If in explorer mode and a new file is detected, or not in explorer mode, then download new file. (I.e. bDownload flag is true).
1513+
if (bDownloadFlag)
15211514
{
1522-
if (ProcessProjectTeamFile(team_file.string(), sTeamETag, mTeamIDsForProject))
1515+
try
1516+
{
1517+
http.Download(prjs.StatsUrl("team"), team_file.string(), userpass);
1518+
}
1519+
catch(const std::runtime_error& e)
15231520
{
1524-
// Insert or update team IDs for the project into the team ID map.
1525-
TeamIDMap[prjs.m_name] = mTeamIDsForProject;
1521+
_log(logattribute::ERR, "DownloadProjectTeamFiles", "Failed to download project team file for " + prjs.m_name);
1522+
continue;
15261523
}
15271524
}
15281525

1526+
// If in explorer mode and new file downloaded, save team xml files to file manifest map with exclude from CSManifest flag set to true.
1527+
// If not in explorer mode, this is not necessary, because the team xml file is just temporary and can be discarded after
1528+
// processing.
1529+
if (fExplorer && bDownloadFlag) AlignScraperFileManifestEntries(team_file, "team", prjs.m_name, true);
1530+
1531+
// If require team whitelist is set, then process the file. This also populates the whitelist.
1532+
if (REQUIRE_TEAM_WHITELIST_MEMBERSHIP) ProcessProjectTeamFile(prjs.m_name, team_file, sTeamETag);
1533+
15291534
if (fDebug3) _log(logattribute::INFO, "ENDLOCK", "cs_TeamIDMap");
15301535
}
15311536

@@ -1534,8 +1539,10 @@ bool DownloadProjectTeamFiles(const NN::WhitelistSnapshot& projectWhitelist)
15341539

15351540

15361541

1537-
bool ProcessProjectTeamFile(const fs::path& file, const std::string& etag, std::map<std::string, int64_t>& mTeamIdsForProject_out)
1542+
bool ProcessProjectTeamFile(const std::string& project, const fs::path& file, const std::string& etag)
15381543
{
1544+
std::map<std::string, int64_t> mTeamIdsForProject;
1545+
15391546
// If passed an empty file, immediately return false.
15401547
if (file.string().empty())
15411548
return false;
@@ -1544,25 +1551,19 @@ bool ProcessProjectTeamFile(const fs::path& file, const std::string& etag, std::
15441551

15451552
if (!ingzfile)
15461553
{
1547-
_log(logattribute::ERR, "ProcessProjectTeamFile", "Failed to open team gzip file (" + file.string() + ")");
1554+
_log(logattribute::ERR, "ProcessProjectTeamFile", "Failed to open team gzip file (" + file.filename().string() + ")");
15481555

15491556
return 0;
15501557
}
15511558

1552-
_log(logattribute::INFO, "ProcessProjectTeamFile", "Opening team file (" + file.string() + ")");
1559+
_log(logattribute::INFO, "ProcessProjectTeamFile", "Opening team file (" + file.filename().string() + ")");
15531560

15541561
boostio::filtering_istream in;
15551562

15561563
in.push(boostio::gzip_decompressor());
15571564
in.push(ingzfile);
15581565

1559-
// Insert csv in output filename. (This is not really used currently, but may be in the future.)
1560-
// std::string gzetagfile = std::string::replace(std::string::find(file.filename().string(), ".gz"), 3, ".csv.gz");
1561-
1562-
// Put path back in.
1563-
// gzetagfile = ((fs::path)(pathScraper / gzetagfile)).string();
1564-
1565-
_log(logattribute::INFO, "ProcessProjectTeamFile", "Started processing " + file.string());
1566+
_log(logattribute::INFO, "ProcessProjectTeamFile", "Started processing " + file.filename().string());
15661567

15671568
std::vector<std::string> vTeamWhiteList = split(TEAM_WHITELIST, "|");
15681569

@@ -1599,13 +1600,13 @@ bool ProcessProjectTeamFile(const fs::path& file, const std::string& etag, std::
15991600
continue;
16001601
}
16011602

1602-
mTeamIdsForProject_out[sTeamName] = nTeamID;
1603+
mTeamIdsForProject[sTeamName] = nTeamID;
16031604
}
16041605
else
16051606
builder.append(line);
16061607
}
16071608

1608-
if (mTeamIdsForProject_out.empty())
1609+
if (mTeamIdsForProject.empty())
16091610
{
16101611
_log(logattribute::CRITICAL, "ProcessProjectTeamFile", "Error in data processing of " + file.string());
16111612

@@ -1618,10 +1619,10 @@ bool ProcessProjectTeamFile(const fs::path& file, const std::string& etag, std::
16181619

16191620
ingzfile.close();
16201621

1621-
// If not explorer mode, delete input file after processing.
1622-
if (!fExplorer && fs::exists(file)) fs::remove(file);
1622+
// Insert or update team IDs for the project into the team ID map. This must be done before the StoreTeamIDList.
1623+
TeamIDMap[project] = mTeamIdsForProject;
16231624

1624-
if (mTeamIdsForProject_out.size() < vTeamWhiteList.size())
1625+
if (mTeamIdsForProject.size() < vTeamWhiteList.size())
16251626
_log(logattribute::ERR, "ProcessProjectTeamFile", "Unable to determine team IDs for one or more whitelisted teams.");
16261627

16271628
// The below is not an ideal implementation, because the entire map is going to be written out to disk each time.
@@ -1632,7 +1633,10 @@ bool ProcessProjectTeamFile(const fs::path& file, const std::string& etag, std::
16321633
else
16331634
_log(logattribute::INFO, "ProcessProjectTeamFile", "Stored Team ID entries.");
16341635

1635-
_log(logattribute::INFO, "ProcessProjectTeamFile", "Finished processing " + file.string());
1636+
// If not explorer mode, delete input file after processing.
1637+
if (!fExplorer && fs::exists(file)) fs::remove(file);
1638+
1639+
_log(logattribute::INFO, "ProcessProjectTeamFile", "Finished processing " + file.filename().string());
16361640

16371641
return true;
16381642
}
@@ -2100,20 +2104,34 @@ bool LoadTeamIDList(const fs::path& file)
21002104
in.push(ingzfile);
21012105

21022106
std::string line;
2107+
std::string separator;
21032108

21042109
// Header. This is used to construct the team names vector, since the team IDs were stored in the same order.
21052110
std::getline(in, line);
21062111

2112+
// This is to detect and handle the loading of a legacy existing TeamID.csv.gz file that contains commas rather than pipes.
2113+
// The file will be rewritten with pipe separators when the team files are processed.
2114+
if (line.find("|") != std::string::npos)
2115+
{
2116+
separator = "|";
2117+
}
2118+
else
2119+
{
2120+
_log(logattribute::INFO, "LoadTeamIDList", "Loading from legacy TeamID.csv.gz file with comma separator. This will be converted to pipe separator.");
2121+
2122+
separator = ",";
2123+
}
2124+
21072125
// This is in the form Project, Gridcoin, ...."
2108-
std::vector<std::string> vTeamNames = split(line, "|");
2126+
std::vector<std::string> vTeamNames = split(line, separator);
21092127
if (fDebug3) _log(logattribute::INFO, "LoadTeamIDList", "Size of vTeamNames = " + std::to_string(vTeamNames.size()));
21102128

21112129
while (std::getline(in, line))
21122130
{
21132131
std::string sProject = {};
21142132
std::map<std::string, int64_t> mTeamIDsForProject = {};
21152133

2116-
std::vector<std::string> vline = split(line, "|");
2134+
std::vector<std::string> vline = split(line, separator);
21172135

21182136
unsigned int iTeamName = 0;
21192137
// Populate team IDs into map.
@@ -2260,6 +2278,8 @@ bool StoreTeamIDList(const fs::path& file)
22602278

22612279
stream << std::endl;
22622280

2281+
if (fDebug3) _log(logattribute::INFO, "StoreTeamIDList", "TeamIDMap size = " + std::to_string(TeamIDMap.size()));
2282+
22632283
// Data
22642284
for (auto const& iProject : TeamIDMap)
22652285
{

0 commit comments

Comments
 (0)