Skip to content
This repository has been archived by the owner on Mar 3, 2022. It is now read-only.

[WIP] Add preliminary phash ETL #76

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions backend/facebook-ad-image-hashes/ChangeLog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Revision history for facebook-ad-image-hashes

## 0.1.0.0 -- YYYY-mm-dd

* First version. Released on an unsuspecting world.
30 changes: 30 additions & 0 deletions backend/facebook-ad-image-hashes/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
Copyright (c) 2018, Greg Hale

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.

* Neither the name of Greg Hale nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2 changes: 2 additions & 0 deletions backend/facebook-ad-image-hashes/Setup.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
import Distribution.Simple
main = defaultMain
23 changes: 23 additions & 0 deletions backend/facebook-ad-image-hashes/default.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{ mkDerivation, aeson, base, blaze-html, bytestring, errors, http-client, http-client-tls, HUnit, kdt, lrucaching
, optparse-applicative, c-phash, hs-phash, pkgconfig, postgresql-simple, resourcet, stdenv
, stm, streaming, streaming-concurrency
, streaming-postgresql-simple, text, zeromq
}:
mkDerivation {
pname = "facebook-ad-image-hashes";
version = "0.1.0.0";
src = ./.;
buildTools = [ pkgconfig ];
isLibrary = true;
isExecutable = true;
libraryHaskellDepends = [
aeson base blaze-html bytestring errors HUnit kdt lrucaching optparse-applicative hs-phash http-client
http-client-tls postgresql-simple resourcet stm streaming streaming-concurrency
streaming-postgresql-simple text
];
executableHaskellDepends = [ base ];
license = stdenv.lib.licenses.bsd3;
libraryPkgconfigDepends = [ c-phash zeromq ];
testPkgconfigDepends = [ c-phash zeromq ];
executablePkgconfigDepends = [ c-phash zeromq ];
}
6 changes: 6 additions & 0 deletions backend/facebook-ad-image-hashes/exec/Main.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
module Main where

import qualified RunCli

main :: IO ()
main = RunCli.main
56 changes: 56 additions & 0 deletions backend/facebook-ad-image-hashes/facebook-ad-image-hashes.cabal
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
-- Initial facebook-ad-image-hashes.cabal generated by cabal init. For
-- further documentation, see http://haskell.org/cabal/users-guide/

name: facebook-ad-image-hashes
version: 0.1.0.0
-- synopsis:
-- description:
license: BSD3
license-file: LICENSE
author: Greg Hale
maintainer: [email protected]
-- copyright:
category: Web
build-type: Simple
extra-source-files: ChangeLog.md
cabal-version: >=1.10

library
exposed-modules: CliOptions
Queries
Report
RunCli
Search
build-depends: base >=4.9 && <4.11
, aeson
, blaze-html
, bytestring
, directory
, errors
, filepath
, http-client
, http-client-tls
, HUnit
, kdt
, lrucaching
, optparse-applicative
, phash
, postgresql-simple
, random
, resourcet
, stm
, streaming
, streaming-concurrency
, streaming-postgresql-simple
, mtl
, text
pkgconfig-depends: pHash, libzmq
hs-source-dirs: src
default-language: Haskell2010

executable hashes-cli
build-depends: base
, facebook-ad-image-hashes
hs-source-dirs: exec
main-is: Main.hs
default-language: Haskell2010
19 changes: 19 additions & 0 deletions backend/facebook-ad-image-hashes/shell.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
let myoverlay = self: super: {
haskellPackages = super.haskellPackages.override {
overrides = hself: hsuper:
let
dj = self.haskell.lib.doJailbreak;
dc = self.haskell.lib.dontCheck;
in
{
phash = null;
# hs-phash = dj (dc (hself.callPackage ../../../phash/default.nix {}));
# hs-phash = ((../../../phash/default.nix));
hs-phash = hself.callPackage ../../../phash/default.nix { c-phash = pkgs.phash; };
resourcet = dj (dc hsuper.resourcet_1_1_11);
postgresql-simple = dj (dc hsuper.postgresql-simple);
};
};
};
pkgs = import ../../../nixpkgs { overlays = [ myoverlay ]; };
in (pkgs.haskellPackages.callPackage ./. { c-phash = pkgs.phash; }).env
163 changes: 163 additions & 0 deletions backend/facebook-ad-image-hashes/src/CliOptions.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}

module CliOptions where

import Control.Monad
import Control.Exception
import Data.Maybe
import Data.List.NonEmpty (NonEmpty(..))
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import qualified Database.PostgreSQL.Simple as PG
import Options.Applicative
import System.Environment
import System.IO

import Search

getCommand :: IO Command
getCommand = execParser $ info (pCommand <**> helper)
( fullDesc
<> progDesc "Run commands for perceptual hashes against the ads database"
)

data Command
= CmdDbTest PG.ConnectInfo
| CmdResetHashes PG.ConnectInfo
| CmdPopulateHashes PG.ConnectInfo
| CmdSearch SearchOptions
deriving (Show)


pCommand :: Parser Command
pCommand =
(hsubparser $
command "test-db"
(info (CmdDbTest <$> dbConn)
(progDesc "Test connection to the ads database"))
<> command "reset-phashes"
(info (CmdResetHashes <$> dbConn)
(progDesc "Clear the phash column in the ads database"))
<> command "populate-phashes"
(info (CmdPopulateHashes <$> dbConn)
(progDesc "Compute phashes for images in the ads database"))
<> command "search"
(info (CmdSearch <$> searchParser)
(progDesc "Search for similar images"))
)


searchParser :: Parser SearchOptions
searchParser = SearchOptions
<$> some (
(fmap Left (strOption (long "filepath" <> help "Filepath to query"))
<|>
fmap (Right . URL) (strOption (long "url" <> help "URL to query"))
)
)
<*> searchTypeParser
<*> (fmap Just (option auto (long "cache-file"
<> help "Cache filepath")
)
<|> pure Nothing)
<*> (( flag' True (long "overwrite-cache") *>
((\db thr -> OverwriteCache db thr) <$>
dbConn <*>
fmap IdentityGroupingThreshold (option auto (long "threshold"))
)
) <|> pure UseCache)
<*> (fmap Just (strOption (long "out" <> help "Generate report in html (with .htm or .html suffix) or json")) <|> pure Nothing)


searchTypeParser :: Parser SearchType
searchTypeParser =
fmap SearchKNearest
(option auto (long "k-nearest" <> help "Get the k nearest results"))
<|> (SearchFirstInRanges
<$> option auto (long "range-bounds" <>
help "List of boundaries for concentric ring search")
<*> option auto (long "n-examples" <>
help "Number of examples per ring range")
)
<|> pure SearchNearest


dbConn :: Parser PG.ConnectInfo
dbConn =
PG.ConnectInfo
<$> strOption (long "dbhost"
<> short 'h'
<> help "Database Host"
<> value "localhost")
<*> option auto (long "dbport"
<> short 'p'
<> help "Database Port"
<> value 5432)
<*> strOption (long "dbuser"
<> short 'U'
<> help "Database User"
<> value "facebook_ads")
<*> strOption (long "dbpass"
<> short 'p'
<> help "Database Password"
<> value "password")
<*> strOption (long "dbname"
<> short 'd'
<> help "Database Name"
<> value "facebook_ads")


-- Extra Utilities for allowing CLI parser to sample env vars
-- and dotenv files
type Env = [(Text, Text)]

class FromText a where
fromText :: Text -> Either String a

instance FromText Text where
fromText = Right

environ :: (HasValue f, FromText a) => Text -> Env -> Mod f a
environ k env = maybe idm value . join $ parse <$> lookup k env
where
parse = either (const Nothing) Just . fromText

-- Read in all env vars and any vars from
-- an environment variable file
importEnv :: Maybe FilePath
-- ^ Path to an environment variable,
-- @Nothing@ will default to `~/.env`
-> IO Env
importEnv envVarFile = do
env <- getEnvironment
dotEnv <- case envVarFile of
Nothing ->
(readFile ".env") `catch` (\(e :: SomeException) -> return "")
Just fp ->
readFile fp
let fileEnv = getFileEnv dotEnv
env' = map (\(k,v) -> (T.pack k, T.pack v)) env
return (env' ++ fileEnv)
where

getFileEnv :: String -> Env
getFileEnv c = catMaybes . map splitPair . lines $ c

stripLeadingSpace = dropWhile (\c -> elem c [' ', '\t'])

-- Turn a line like
-- "HOST=0.0.0.0:8080 #The host to listen on"
-- into @Just ("HOST", "0.0.0.0:8080:")@
-- or
-- "#This is a comment"
-- into @Nothing@
splitPair :: String -> Maybe (Text, Text)
splitPair l = case takeWhile (/= '#') l of
"" -> Nothing
l' -> let (key, val) = break (== '=') l'
in if length key > 0
&& length (stripLeadingSpace val) > 0
then Just (T.pack key, T.pack (stripLeadingSpace val))
else Nothing
Loading