* Copyright 2012 Facebook, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
* Run pull commands on local working copies to keep them up to date. This
* daemon handles all repository types.
* By default, the daemon pulls **every** repository. If you want it to be
* responsible for only some repositories, you can launch it with a list of
* PHIDs or callsigns:
* ./phd launch repositorypulllocal -- X Q Z
* You can also launch a daemon which is responsible for all //but// one or
* more repositories:
* ./phd launch repositorypulllocal -- --not A --not B
* If you have a very large number of repositories and some aren't being pulled
* as frequently as you'd like, you can either change the pull frequency of
* the less-important repositories to a larger number (so the daemon will skip
* them more often) or launch one daemon for all the less-important repositories
* and one for the more important repositories (or one for each more important
* repository).
* @task pull Pulling Repositories
* @task git Git Implementation
* @task hg Mercurial Implementation
final class PhabricatorRepositoryPullLocalDaemon
extends PhabricatorDaemon {
// Static memo of commits already known to be recorded. Keys are produced by
// getCacheKey() ("<repository ID>:<commit identifier>") and every stored value
// is true; entries are written by setCache() and read by getCache().
private static $commitCache = array();
/* -( Pulling Repositories )----------------------------------------------- */
* @task pull
/**
 * Daemon entry point.
 *
 * Reads the launch arguments ("no-discovery", repeatable "not", and a
 * wildcard list of repositories), then loops forever: load the selected
 * repositories, process each one whose retry time has passed, and sleep
 * until the next repository is due.
 *
 * @task pull
 */
public function run() {
$argv = $this->getArgv();
// Prepend the class name before handing the list to the parser (presumably it
// stands in for the program name -- confirm against PhutilArgumentParser).
array_unshift($argv, __CLASS__);
$args = new PhutilArgumentParser($argv);
// Flag: pull only, skip commit discovery.
'name' => 'no-discovery',
'help' => 'Pull only, without discovering commits.',
// Repeatable option naming a repository to exclude.
'name' => 'not',
'param' => 'repository',
'repeat' => true,
'help' => 'Do not pull __repository__.',
// Wildcard: the remaining arguments name the repositories to pull.
'name' => 'repositories',
'wildcard' => true,
'help' => 'Pull specific __repositories__ instead of all.',
$no_discovery = $args->getArg('no-discovery');
$repo_names = $args->getArg('repositories');
$exclude_names = $args->getArg('not');
// Each repository has an individual pull frequency; after we pull it,
// wait that long to pull it again. When we start up, try to pull everything
// serially.
// $retry_after maps repository ID => earliest epoch at which to try again.
$retry_after = array();
// Lower bound (in seconds) for sleeps; also the default pull frequency and
// the retry delay after a failure.
$min_sleep = 15;
while (true) {
// Reload on every pass, so the set of repositories is re-read each time.
$repositories = $this->loadRepositories($repo_names);
if ($exclude_names) {
$exclude = $this->loadRepositories($exclude_names);
// array_diff_key() compares array keys, so both lists must share a keying.
// NOTE(review): presumably both are keyed by repository ID -- confirm.
$repositories = array_diff_key($repositories, $exclude);
// Shuffle the repositories, then re-key the array since shuffle()
// discards keys. This is mostly for startup, we'll use soft priorities
// later.
$repositories = mpull($repositories, null, 'getID');
// If any repositories were deleted, remove them from the retry timer map
// so we don't end up with a retry timer that never gets updated and
// causes us to sleep for the minimum amount of time.
$retry_after = array_select_keys(
// Assign soft priorities to repositories based on how frequently they
// should pull again.
$repositories = array_select_keys(
array_keys($retry_after)) + $repositories;
foreach ($repositories as $id => $repository) {
// Repositories without a timer default to 0, which is always in the past.
$after = idx($retry_after, $id, 0);
// A retry time still in the future means this repository is not due yet.
if ($after > time()) {
$tracked = $repository->isTracked();
if (!$tracked) {
try {
if (!$no_discovery) {
// TODO: It would be nice to discover only if we pulled something,
// but this isn't totally trivial.
// On success, schedule the next attempt "pull-frequency" seconds from now,
// falling back to $min_sleep when the repository does not set one.
$sleep_for = $repository->getDetail('pull-frequency', $min_sleep);
$retry_after[$id] = time() + $sleep_for;
} catch (Exception $ex) {
// On failure, try this repository again after the minimum delay.
$retry_after[$id] = time() + $min_sleep;
if ($retry_after) {
// Wake at the earliest retry time, but never sooner than $min_sleep seconds
// from now.
$sleep_until = max(min($retry_after), time() + $min_sleep);
} else {
$sleep_until = time() + $min_sleep;
$this->sleep($sleep_until - time());
* @task pull
/**
 * Load the repositories identified by a list of PHIDs or callsigns.
 *
 * An empty list selects every repository.
 *
 * @param array $names PHIDs or callsigns; may be empty.
 * @return mixed Result of PhabricatorRepository::loadAll() when $names is
 *               empty, otherwise of
 *               PhabricatorRepository::loadAllByPHIDOrCallsign().
 * @task pull
 */
protected function loadRepositories(array $names) {
  // Guard clause: an explicit selection goes straight to the lookup.
  if (count($names)) {
    return PhabricatorRepository::loadAllByPHIDOrCallsign($names);
  }

  $all_repositories = id(new PhabricatorRepository())->loadAll();
  return $all_repositories;
}
* @task pull
/**
 * Bring the local working copy of a repository up to date, creating it when
 * nothing exists at the configured local path yet.
 *
 * Git and Mercurial repositories are dispatched to the matching
 * create/update helper. Subversion is special-cased before any local path
 * handling.
 *
 * @param PhabricatorRepository $repository Repository to pull.
 * @return mixed Whatever the selected create/update helper returns.
 * @throws Exception If the VCS is not recognized, or the repository has no
 *                   'local-path' detail.
 * @task pull
 */
public static function pullRepository(PhabricatorRepository $repository) {
$vcs = $repository->getVersionControlSystem();
$is_svn = ($vcs == PhabricatorRepositoryType::REPOSITORY_TYPE_SVN);
$is_git = ($vcs == PhabricatorRepositoryType::REPOSITORY_TYPE_GIT);
$is_hg = ($vcs == PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL);
if ($is_svn) {
// The callsign is only used to build the error messages below.
$callsign = $repository->getCallsign();
if (!$is_git && !$is_hg) {
throw new Exception(
"Unknown VCS '{$vcs}' for repository '{$callsign}'!");
$local_path = $repository->getDetail('local-path');
if (!$local_path) {
throw new Exception(
"No local path is available for repository '{$callsign}'.");
// Nothing at the local path yet: make sure the parent directory exists, then
// create the working copy.
if (!Filesystem::pathExists($local_path)) {
$dirname = dirname($local_path);
if (!Filesystem::pathExists($dirname)) {
Filesystem::createDirectory($dirname, 0755, $recursive = true);
if ($is_git) {
return self::executeGitCreate($repository, $local_path);
} else if ($is_hg) {
return self::executeHgCreate($repository, $local_path);
} else {
// Something already exists at the local path: update it in place.
if ($is_git) {
return self::executeGitUpdate($repository, $local_path);
} else if ($is_hg) {
return self::executeHgUpdate($repository, $local_path);
/**
 * Run commit discovery for a repository using the implementation that
 * matches its version control system.
 *
 * @param PhabricatorRepository $repository Repository to discover.
 * @return mixed Result of the VCS-specific discover helper.
 * @throws Exception If the repository reports an unrecognized VCS.
 */
public static function discoverRepository(PhabricatorRepository $repository) {
  $vcs = $repository->getVersionControlSystem();

  // Checked in the same order, and with the same loose comparison, as a
  // switch over the type constants.
  if ($vcs == PhabricatorRepositoryType::REPOSITORY_TYPE_GIT) {
    return self::executeGitDiscover($repository);
  }

  if ($vcs == PhabricatorRepositoryType::REPOSITORY_TYPE_SVN) {
    return self::executeSvnDiscover($repository);
  }

  if ($vcs == PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL) {
    return self::executeHgDiscover($repository);
  }

  throw new Exception("Unknown VCS '{$vcs}'!");
}
/**
 * Determine whether a commit has already been recorded for a repository.
 *
 * The static commit cache is consulted first; on a miss, a
 * PhabricatorRepositoryCommit row is looked up by repository ID and commit
 * identifier. Positive results are added to the cache.
 *
 * @param PhabricatorRepository $repository Repository to check.
 * @param mixed                 $target     Commit identifier to look for.
 * @return bool True if the commit is known, false otherwise.
 */
private static function isKnownCommit(
PhabricatorRepository $repository,
$target) {
// Fast path: a previous call already confirmed this commit.
if (self::getCache($repository, $target)) {
return true;
$commit = id(new PhabricatorRepositoryCommit())->loadOneWhere(
'repositoryID = %s AND commitIdentifier = %s',
if (!$commit) {
return false;
self::setCache($repository, $target);
// Runs for as long as the cache holds more than 2048 entries, which bounds
// the cache size.
while (count(self::$commitCache) > 2048) {
return true;
/**
 * Determine whether a commit is recorded and the repository considers it
 * eligible for autoclose.
 *
 * Returns false if the commit row or its commit data cannot be loaded;
 * otherwise the answer comes from
 * PhabricatorRepository::shouldAutocloseCommit().
 *
 * @param PhabricatorRepository $repository Repository to check.
 * @param mixed                 $target     Commit identifier to look for.
 * @return bool
 */
private static function isKnownCommitOnAnyAutocloseBranch(
PhabricatorRepository $repository,
$target) {
$commit = id(new PhabricatorRepositoryCommit())->loadOneWhere(
'repositoryID = %s AND commitIdentifier = %s',
if (!$commit) {
return false;
$data = $commit->loadCommitData();
if (!$data) {
return false;
// Normalize the repository's answer to a strict boolean.
if ($repository->shouldAutocloseCommit($commit, $data)) {
return true;
return false;
/**
 * Record a newly discovered commit.
 *
 * Builds a new PhabricatorRepositoryCommit, creates a timeline event that
 * carries the commit ID, queues a message parser task through insertTask(),
 * updates a per-repository summary row (size, lastCommitID, epoch), and
 * marks the commit as known in the static cache.
 *
 * A duplicate-key failure is deliberately ignored; the commit is still
 * marked as known in that case.
 *
 * @param PhabricatorRepository $repository Repository the commit belongs to.
 * @param mixed                 $epoch      Commit timestamp.
 */
private static function recordCommit(
PhabricatorRepository $repository,
$epoch) {
$commit = new PhabricatorRepositoryCommit();
try {
$event = new PhabricatorTimelineEvent(
'id' => $commit->getID(),
self::insertTask($repository, $commit);
// Summary upsert: a fresh row starts with size 1. The update expressions
// increment size and keep the lastCommitID/epoch pair of whichever commit
// has the larger epoch.
'INSERT INTO %T (repositoryID, size, lastCommitID, epoch)
VALUES (%d, 1, %d, %d)
size = size + 1,
lastCommitID =
IF(VALUES(epoch) > epoch, VALUES(lastCommitID), lastCommitID),
epoch = IF(VALUES(epoch) > epoch, VALUES(epoch), epoch)',
self::setCache($repository, $commit_identifier);
} catch (AphrontQueryDuplicateKeyException $ex) {
// Ignore. This can happen because we discover the same new commit
// more than once when looking at history, or because of races or
// data inconsistency or cosmic radiation; in any case, we're still
// in a good state if we ignore the failure.
self::setCache($repository, $commit_identifier);
/**
 * Note that an already recorded commit has been seen on a branch.
 *
 * Loads the commit and its commit data (creating a new
 * PhabricatorRepositoryCommitData when none exists) and appends $branch to
 * the 'seenOnBranches' commit detail.
 *
 * @param PhabricatorRepository $repository Repository the commit belongs to.
 * @param mixed                 $branch     Branch the commit was seen on.
 */
private static function updateCommit(
PhabricatorRepository $repository,
$branch) {
$commit = id(new PhabricatorRepositoryCommit())->loadOneWhere(
'repositoryID = %s AND commitIdentifier = %s',
if (!$commit) {
// This can happen if the phabricator DB doesn't have the commit info,
// or the commit is so big that phabricator couldn't parse it. In this
// case we just ignore it.
$data = id(new PhabricatorRepositoryCommitData())->loadOneWhere(
'commitID = %d',
if (!$data) {
$data = new PhabricatorRepositoryCommitData();
// Append to the list of branches; there is no de-duplication at this point.
$branches = $data->getCommitDetail('seenOnBranches', array());
$branches[] = $branch;
$data->setCommitDetail('seenOnBranches', $branches);
// NOTE(review): this looks like extra task data for insertTask() -- confirm
// what 'only' means to the parser worker.
'only' => true
/**
 * Prepare a worker task that parses the commit message of a commit.
 *
 * The worker class is chosen from the repository's version control system,
 * and the commit's ID is added to the task data under 'commitID'.
 *
 * @param PhabricatorRepository       $repository Repository of the commit.
 * @param PhabricatorRepositoryCommit $commit     Commit to be parsed.
 * @param array                       $data       Extra task data; the
 *                                                'commitID' key is always
 *                                                overwritten.
 * @throws Exception If the repository type is not recognized.
 */
private static function insertTask(
PhabricatorRepository $repository,
PhabricatorRepositoryCommit $commit,
$data = array()) {
$vcs = $repository->getVersionControlSystem();
// Each VCS has its own commit message parser worker.
switch ($vcs) {
case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT:
$class = 'PhabricatorRepositoryGitCommitMessageParserWorker';
case PhabricatorRepositoryType::REPOSITORY_TYPE_SVN:
$class = 'PhabricatorRepositorySvnCommitMessageParserWorker';
case PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL:
$class = 'PhabricatorRepositoryMercurialCommitMessageParserWorker';
throw new Exception("Unknown repository type '{$vcs}'!");
$task = new PhabricatorWorkerTask();
$data['commitID'] = $commit->getID();
/**
 * Mark a commit as known in the static commit cache.
 *
 * @param PhabricatorRepository $repository        Repository of the commit.
 * @param mixed                 $commit_identifier Commit identifier.
 */
private static function setCache(
  PhabricatorRepository $repository,
  $commit_identifier) {

  // The cache only tracks presence, so the stored value is always true.
  self::$commitCache[self::getCacheKey($repository, $commit_identifier)] = true;
}
/**
 * Look up a commit in the static commit cache.
 *
 * @param PhabricatorRepository $repository        Repository of the commit.
 * @param mixed                 $commit_identifier Commit identifier.
 * @return mixed The cached value for the commit, or false when it has no
 *               cache entry.
 */
private static function getCache(
  PhabricatorRepository $repository,
  $commit_identifier) {

  return idx(
    self::$commitCache,
    self::getCacheKey($repository, $commit_identifier),
    false);
}
/**
 * Build the commit cache key for a commit: the repository ID and the commit
 * identifier joined by a colon.
 *
 * @param PhabricatorRepository $repository        Repository of the commit.
 * @param mixed                 $commit_identifier Commit identifier.
 * @return string Cache key of the form "<repository ID>:<identifier>".
 */
private static function getCacheKey(
  PhabricatorRepository $repository,
  $commit_identifier) {

  $repository_id = $repository->getID();
  return "{$repository_id}:{$commit_identifier}";
}
/* -( Git Implementation )------------------------------------------------- */
* @task git
/**
 * Create the local working copy of a Git repository by cloning it.
 *
 * @param PhabricatorRepository $repository Repository to clone.
 * @param string                $path       Local path to clone into.
 * @task git
 */
private static function executeGitCreate(
PhabricatorRepository $repository,
$path) {
// The remote is explicitly named "origin". Trailing slashes are stripped from
// the local path before it is passed to the command.
'clone --origin origin %s %s',
rtrim($path, '/'));
* @task git
/**
 * Update an existing local Git working copy.
 *
 * Before fetching, verifies that $path really is the top level of a Git
 * repository, and raises a tailored exception for the common
 * misconfigurations: an empty directory, a non-repository directory, no
 * output from `git rev-parse --show-toplevel`, or a path which lies inside
 * some other repository. The update itself is `fetch --all --prune`.
 *
 * @param PhabricatorRepository $repository Repository to update.
 * @param string                $path       Local path of the working copy.
 * @throws Exception If one of the sanity checks fails.
 * @task git
 */
private static function executeGitUpdate(
PhabricatorRepository $repository,
$path) {
// Run a bunch of sanity checks to detect people checking out repositories
// inside other repositories, making empty directories, pointing the local
// path at some random file or path, etc.
list($err, $stdout) = $repository->execLocalCommand(
'rev-parse --show-toplevel');
if ($err) {
// Try to raise a more tailored error message in the more common case
// of the user creating an empty directory. (We could try to remove it,
// but might not be able to, and it's much simpler to raise a good
// message than try to navigate those waters.)
if (is_dir($path)) {
// Hidden files are included, so a directory holding only dotfiles does not
// count as empty.
$files = Filesystem::listDirectory($path, $include_hidden = true);
if (!$files) {
throw new Exception(
"Expected to find a git repository at '{$path}', but there ".
"is an empty directory there. Remove the directory: the daemon ".
"will run 'git clone' for you.");
throw new Exception(
"Expected to find a git repository at '{$path}', but there is ".
"a non-repository directory (with other stuff in it) there. Move or ".
"remove this directory (or reconfigure the repository to use a ".
"different directory), and then either clone a repository yourself ".
"or let the daemon do it.");
} else {
// The command succeeded; its output is the repository's top-level directory.
$repo_path = rtrim($stdout, "\n");
if (empty($repo_path)) {
throw new Exception(
"Expected to find a git repository at '{$path}', but ".
"there was no result from `git rev-parse --show-toplevel`. ".
"Something is misconfigured or broken. The git repository ".
"may be inside a '.git/' directory.");
// The reported top level must be the configured path itself, not some
// enclosing repository.
if (!Filesystem::pathsAreEquivalent($repo_path, $path)) {
throw new Exception(
"Expected to find repo at '{$path}', but the actual ".
"git repository root for this directory is '{$repo_path}'. ".
"Something is misconfigured. The repository's 'Local Path' should ".
"be set to some place where the daemon can check out a working ".
"copy, and should not be inside another git repository.");
// This is a local command, but needs credentials.
$future = $repository->getRemoteCommandFuture('fetch --all --prune');
* @task git
/**
 * Discover new commits in a Git repository.
 *
 * Reads the fetch URL of the "origin" remote, lists the remote branches,
 * and makes two passes over them: the first discovers commits at branch
 * heads which are not yet known, the second revisits branches for autoclose
 * handling and passes the branch name to executeGitDiscoverCommit().
 *
 * @param PhabricatorRepository $repository Repository to discover.
 * @throws Exception If `git remote show -n origin` reports no 'Fetch URL',
 *                   or if none of the branches is tracked.
 * @task git
 */
private static function executeGitDiscover(
PhabricatorRepository $repository) {
list($remotes) = $repository->execxLocalCommand(
'remote show -n origin');
$matches = null;
// Capture the value of the "Fetch URL:" line (multi-line mode).
if (!preg_match('/^\s*Fetch URL:\s*(.*?)\s*$/m', $remotes, $matches)) {
throw new Exception(
"Expected 'Fetch URL' in 'git remote show -n origin'.");
// "--no-abbrev" makes git print full commit hashes for the branch heads.
list($stdout) = $repository->execxLocalCommand(
'branch -r --verbose --no-abbrev');
// Only branches of the default remote are considered.
$branches = DiffusionGitBranchQuery::parseGitRemoteBranchOutput(
$only_this_remote = DiffusionBranchInformation::DEFAULT_GIT_REMOTE);
// First pass: discover commits at the heads of branches.
$tracked_something = false;
foreach ($branches as $name => $commit) {
if (!$repository->shouldTrackBranch($name)) {
$tracked_something = true;
if (self::isKnownCommit($repository, $commit)) {
} else {
self::executeGitDiscoverCommit($repository, $commit);
// Finding no tracked branch at all is reported as a configuration error.
if (!$tracked_something) {
$repo_name = $repository->getName();
$repo_callsign = $repository->getCallsign();
throw new Exception(
"Repository r{$repo_callsign} '{$repo_name}' has no tracked branches! ".
"Verify that your branch filtering settings are correct.");
// Second pass: autoclose handling. Passing the branch name switches
// executeGitDiscoverCommit() into its per-branch update mode.
foreach ($branches as $name => $commit) {
if (!$repository->shouldTrackBranch($name)) {
if (!$repository->shouldAutocloseBranch($name)) {
if (self::isKnownCommitOnAnyAutocloseBranch($repository, $commit)) {
self::executeGitDiscoverCommit($repository, $commit, $name);
* @task git
/**
 * Walk history backward from $commit and process every commit which is not
 * yet known.
 *
 * The first loop reads each commit's parents with `git log` and collects
 * the unknown ones. The second loop pops the collected commits, reads each
 * one's commit timestamp, and either records it (no branch given) or
 * updates it for the branch (branch given).
 *
 * @param PhabricatorRepository $repository Repository to discover in.
 * @param string|null           $branch     When not null, "known" is
 *                                          decided by
 *                                          isKnownCommitOnAnyAutocloseBranch()
 *                                          and commits go to updateCommit()
 *                                          instead of recordCommit().
 * @task git
 */
private static function executeGitDiscoverCommit(
PhabricatorRepository $repository,
$branch = null) {
// $discover is the stack of commits whose parents still need to be examined.
// $insert collects every commit to process afterwards.
$discover = array($commit);
$insert = array($commit);
$seen_parent = array();
while (true) {
$target = array_pop($discover);
// "%P" prints the parent hashes of the commit, separated by spaces.
list($parents) = $repository->execxLocalCommand(
'log -n1 --pretty="%%P" %s',
// array_filter() drops the empty string produced for a commit with no
// parents.
$parents = array_filter(explode(' ', trim($parents)));
foreach ($parents as $parent) {
if (isset($seen_parent[$parent])) {
// We end up in a loop here somehow when we parse Arcanist if we
// don't do this. TODO: Figure out why and draw a pretty diagram
// since it's not evident how parsing a DAG with this causes the
// loop to stop terminating.
$seen_parent[$parent] = true;
if ($branch !== null) {
$known = self::isKnownCommitOnAnyAutocloseBranch(
} else {
$known = self::isKnownCommit($repository, $parent);
if (!$known) {
$discover[] = $parent;
$insert[] = $parent;
if (empty($discover)) {
// array_pop() takes from the end of $insert, so the commits collected last
// are processed first.
while (true) {
$target = array_pop($insert);
// "%ct" prints the committer date as a UNIX timestamp.
list($epoch) = $repository->execxLocalCommand(
'log -n1 --pretty="%%ct" %s',
$epoch = trim($epoch);
if ($branch !== null) {
self::updateCommit($repository, $target, $branch);
} else {
self::recordCommit($repository, $target, $epoch);
if (empty($insert)) {
* @task git
/**
 * Verify that a working copy's origin URL refers to the same repository as
 * the configured remote URL.
 *
 * Only the path components of the two URIs are compared, after trailing
 * slashes and a ".git" suffix have been normalized away.
 *
 * @param string $remote Origin URL reported by the working copy.
 * @param string $expect Remote URL configured for the repository.
 * @param string $where  Working copy location, used in the error message.
 * @return void
 * @throws Exception If the normalized paths of the two URLs differ.
 * @task git
 */
public static function executeGitVerifySameOrigin($remote, $expect, $where) {
  $remote_uri = PhabricatorRepository::newPhutilURIFromGitURI($remote);
  $expect_uri = PhabricatorRepository::newPhutilURIFromGitURI($expect);

  $remote_path = $remote_uri->getPath();
  $expect_path = $expect_uri->getPath();

  $remote_match = self::executeGitNormalizePath($remote_path);
  $expect_match = self::executeGitNormalizePath($expect_path);

  // Compare strictly. With a loose "!=", PHP compares two numeric-looking
  // strings as numbers, so distinct paths such as "0123" and "123" would be
  // accepted as the same origin.
  if ($remote_match !== $expect_match) {
    throw new Exception(
      "Working copy at '{$where}' has a mismatched origin URL. It has ".
      "origin URL '{$remote}' (with remote path '{$remote_path}'), but the ".
      "configured URL '{$expect}' (with remote path '{$expect_path}') is ".
      "expected. Refusing to proceed because this may indicate that the ".
      "working copy is actually some other repository.");
  }
}
* @task git
/**
 * Normalize a Git remote path for comparison by removing trailing slashes
 * and then a trailing ".git".
 *
 * @param string $path Path component of a Git URI.
 * @return string Normalized path.
 * @task git
 */
private static function executeGitNormalizePath($path) {
  // Slashes are trimmed first so that "repo.git/" also loses its suffix.
  $without_slashes = rtrim($path, '/');

  return preg_replace('/\.git$/', '', $without_slashes);
}
/* -( Mercurial Implementation )------------------------------------------- */
* @task hg
/**
 * Create the local working copy of a Mercurial repository by cloning it.
 *
 * @param PhabricatorRepository $repository Repository to clone.
 * @param string                $path       Local path to clone into.
 * @task hg
 */
private static function executeHgCreate(
PhabricatorRepository $repository,
$path) {
// Trailing slashes are stripped from the local path before it is passed to
// the command.
'clone %s %s',
rtrim($path, '/'));
* @task hg
/**
 * Update an existing local Mercurial working copy with `hg pull -u`.
 *
 * A CommandException with error code 1 whose stdout contains "no changes
 * found" gets special handling (see the notes below); any other
 * CommandException is rethrown.
 *
 * @param PhabricatorRepository $repository Repository to update.
 * @param string                $path       Local path of the working copy.
 * @throws CommandException If the pull fails for any other reason.
 * @task hg
 */
private static function executeHgUpdate(
PhabricatorRepository $repository,
$path) {
// This is a local command, but needs credentials.
$future = $repository->getRemoteCommandFuture('pull -u');
try {
} catch (CommandException $ex) {
$err = $ex->getError();
$stdout = $ex->getStdOut();
// NOTE: Between versions 2.1 and 2.1.1, Mercurial changed the behavior
// of "hg pull" to return 1 in case of a successful pull with no changes.
// This behavior has been reverted, but users who updated between Feb 1,
// 2012 and Mar 1, 2012 will have the erroring version. Do a dumb test
// against stdout to check for this possibility.
// See:
// NOTE: Mercurial has translated versions, which translate this error
// string. In a translated version, the string will be something else,
// like "aucun changement trouve". There didn't seem to be an easy way
// to handle this (there are hard ways but this is not a common problem
// and only creates log spam, not application failures). Assume English.
// TODO: Remove this once we're far enough in the future that deployment
// of 2.1 is exceedingly rare?
if ($err == 1 && preg_match('/no changes found/', $stdout)) {
} else {
throw $ex;
/**
 * Discover new commits in a Mercurial repository.
 *
 * Lists the branches, and for every branch whose head revision is not yet
 * known, walks its history with executeHgDiscoverCommit().
 *
 * @param PhabricatorRepository $repository Repository to discover.
 * @return bool True if discovery ran for at least one branch head.
 */
private static function executeHgDiscover(PhabricatorRepository $repository) {
// NOTE: "--debug" gives us 40-character hashes.
list($stdout) = $repository->execxLocalCommand('--debug branches');
$branches = ArcanistMercurialParser::parseMercurialBranches($stdout);
$got_something = false;
foreach ($branches as $name => $branch) {
// Each parsed branch entry carries its head revision under 'rev'.
$commit = $branch['rev'];
if (self::isKnownCommit($repository, $commit)) {
} else {
self::executeHgDiscoverCommit($repository, $commit);
$got_something = true;
return $got_something;
/**
 * Walk Mercurial history backward from $commit, collect every commit which
 * is not yet known, and record all of them.
 *
 * Parents and commit dates are read through a
 * PhabricatorMercurialGraphStream.
 *
 * @param PhabricatorRepository $repository Repository to discover in.
 * @param string                $commit     Revision to start from.
 */
private static function executeHgDiscoverCommit(
PhabricatorRepository $repository,
$commit) {
// $discover is the stack of commits whose parents still need to be examined.
// $insert collects every commit to record afterwards.
$discover = array($commit);
$insert = array($commit);
$seen_parent = array();
$stream = new PhabricatorMercurialGraphStream($repository);
// For all the new commits at the branch heads, walk backward until we
// find only commits we've already seen.
while ($discover) {
$target = array_pop($discover);
$parents = $stream->getParents($target);
foreach ($parents as $parent) {
// $seen_parent remembers parents already encountered during this walk.
if (isset($seen_parent[$parent])) {
$seen_parent[$parent] = true;
if (!self::isKnownCommit($repository, $parent)) {
$discover[] = $parent;
$insert[] = $parent;
// Record every collected commit, in the order it was collected.
foreach ($insert as $target) {
$epoch = $stream->getCommitDate($target);
self::recordCommit($repository, $target, $epoch);
/* -( Subversion Implementation )------------------------------------------ */
/**
 * Discover new commits in a Subversion repository.
 *
 * Fetches the most recent log entry at HEAD. If that revision is already
 * known there is nothing to do; otherwise history is walked backward from
 * it with executeSvnDiscoverCommit().
 *
 * @param PhabricatorRepository $repository Repository to discover.
 * @return bool True if new commits were discovered, false if the newest
 *              revision was already known.
 */
private static function executeSvnDiscover(
PhabricatorRepository $repository) {
$uri = self::executeSvnGetBaseSVNLogURI($repository);
// "--limit 1" with "@HEAD" asks for only the newest log entry.
list($xml) = $repository->execxRemoteCommand(
'log --xml --quiet --limit 1 %s@HEAD',
// The parsed result maps revision => epoch, so the first key is the revision
// and the first value is its timestamp.
$results = self::executeSvnParseLogXML($xml);
$commit = head_key($results);
$epoch = head($results);
if (self::isKnownCommit($repository, $commit)) {
return false;
self::executeSvnDiscoverCommit($repository, $commit, $epoch);
return true;
/**
 * Walk Subversion history backward from a revision, collect every revision
 * which is not yet known, and then record all of them.
 *
 * Log entries are requested in batches whose size doubles on each round, up
 * to 256. A "path not found" / "File not found" error from `svn log` gets
 * special handling as the start of this path's history; any other error
 * raises an exception.
 *
 * @param PhabricatorRepository $repository Repository to discover in.
 * @param int                   $epoch      Timestamp of the starting
 *                                          revision.
 * @throws Exception If `svn log` fails for any other reason.
 */
private static function executeSvnDiscoverCommit(
PhabricatorRepository $repository,
$epoch) {
$uri = self::executeSvnGetBaseSVNLogURI($repository);
// Map of revision => epoch for everything discovered so far, seeded with the
// starting revision.
$discover = array(
$commit => $epoch,
$upper_bound = $commit;
$limit = 1;
// Keep going while the oldest discovered revision is unknown and is not
// revision 1.
while ($upper_bound > 1 &&
!self::isKnownCommit($repository, $upper_bound)) {
// Find all the unknown commits on this path. Note that we permit
// importing an SVN subdirectory rather than the entire repository, so
// commits may be nonsequential.
list($err, $xml, $stderr) = $repository->execRemoteCommand(
' log --xml --quiet --limit %d %s@%d',
$upper_bound - 1);
if ($err) {
if (preg_match('/(path|File) not found/', $stderr)) {
// We've gone all the way back through history and this path was not
// affected by earlier commits.
} else {
throw new Exception("svn log error #{$err}: {$stderr}");
// Array union: revisions already in $discover keep their existing epoch.
$discover += self::executeSvnParseLogXML($xml);
// The smallest revision seen so far becomes the next upper bound.
$upper_bound = min(array_keys($discover));
// Discover 2, 4, 8, ... 256 logs at a time. This allows us to initially
// import large repositories fairly quickly, while pulling only as much
// data as we need in the common case (when we've already imported the
// repository and are just grabbing one commit at a time).
$limit = min($limit * 2, 256);
// NOTE: We do writes only after discovering all the commits so that we're
// never left in a state where we've missed commits -- if the discovery
// script terminates it can always resume and restore the import to a good
// state. This is also why we sort the discovered commits so we can do
// writes forward from the smallest one.
foreach ($discover as $commit => $epoch) {
self::recordCommit($repository, $commit, $epoch);
/**
 * Parse the output of `svn log --xml` into a map of revisions to
 * timestamps.
 *
 * @param string $xml Raw XML produced by `svn log --xml`.
 * @return array Map of revision number (int) => commit epoch (int), in
 *               document order.
 */
private static function executeSvnParseLogXML($xml) {
  // The XML is passed through phutil_utf8ize() before it is parsed.
  $document = new SimpleXMLElement(phutil_utf8ize($xml));

  $epochs_by_revision = array();
  foreach ($document->logentry as $log_entry) {
    $revision = (int)$log_entry['revision'];
    $date_string = (string)$log_entry->date[0];

    $epochs_by_revision[$revision] = (int)strtotime($date_string);
  }

  return $epochs_by_revision;
}
/**
 * Build the URI used for `svn log` by appending the repository's
 * 'svn-subpath' detail to its 'remote-uri' detail.
 *
 * @param PhabricatorRepository $repository Repository to build the URI for.
 * @return string Remote URI immediately followed by the subpath.
 */
private static function executeSvnGetBaseSVNLogURI(
  PhabricatorRepository $repository) {

  $remote_uri = $repository->getDetail('remote-uri');
  $svn_subpath = $repository->getDetail('svn-subpath');

  return "{$remote_uri}{$svn_subpath}";
}

