Skip to content

Commit

Permalink
Precompute map analysis (#433)
Browse files Browse the repository at this point in the history
* add time metrics

* progress

* Revert "Revert "Use allShortestPaths over shortestPath (#431)""

This reverts commit 167828c.

* map analysis through redis

* new reqs

* add worker actually

* abort if background job fails

* lint

* minor cleanup

* move hash to utils and make db method write to redis optionally

* make ga job results shared among clients

* lint
  • Loading branch information
northdpole authored Oct 23, 2023
1 parent 167828c commit f508bca
Show file tree
Hide file tree
Showing 15 changed files with 338 additions and 168 deletions.
32 changes: 24 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,23 +1,38 @@

.ONESHELL:

.PHONY: dev-run run test covers install-deps dev docker lint frontend clean all
.PHONY: run test covers install-deps dev docker lint frontend clean all

prod-run:
cp cres/db.sqlite standards_cache.sqlite; gunicorn cre:app --log-file=-

dev-run:
. ./venv/bin/activate && FLASK_APP=cre.py FLASK_CONFIG=development flask run
docker-neo4j:
docker start cre-neo4j 2>/dev/null || docker run -d --name cre-neo4j --env NEO4J_PLUGINS='["apoc"]' --env NEO4J_AUTH=neo4j/password --volume=`pwd`/.neo4j/data:/data --volume=`pwd`/.neo4j/logs:/logs --workdir=/var/lib/neo4j -p 7474:7474 -p 7687:7687 neo4j

docker-redis:
docker start redis-stack 2>/dev/null || docker run -d --name redis-stack -p 6379:6379 -p 8001:8001 redis/redis-stack:latest

start-containers: docker-neo4j docker-redis

start-worker:
. ./venv/bin/activate
FLASK_APP=`pwd`/cre.py python cre.py --start_worker

dev-flask:
. ./venv/bin/activate
FLASK_APP=`pwd`/cre.py FLASK_CONFIG=development flask run

e2e:
yarn build
[ -d "./venv" ] && . ./venv/bin/activate
export FLASK_APP=$(CURDIR)/cre.py
export FLASK_CONFIG=development
FLASK_CONFIG=development flask run&
flask run&

yarn test:e2e
killall yarn
killall flask

test:
[ -d "./venv" ] && . ./venv/bin/activate
export FLASK_APP=$(CURDIR)/cre.py
Expand Down Expand Up @@ -45,9 +60,6 @@ docker:
docker-run:
docker run -it -p 5000:5000 opencre:$(shell git rev-parse HEAD)

docker-neo4j:
docker run --env NEO4J_PLUGINS='["apoc"]' --volume=./neo4j/data:/data --volume=/data --volume=/logs --workdir=/var/lib/neo4j -p 7474:7474 -p 7687:7687 -d neo4j

lint:
[ -d "./venv" ] && . ./venv/bin/activate && black . && yarn lint

Expand Down Expand Up @@ -82,4 +94,8 @@ import-all:
[ -d "./venv" ] && . ./venv/bin/activate
rm -rf standards_cache.sqlite && make migrate-upgrade && export FLASK_APP=$(CURDIR)/cre.py && python cre.py --add --from_spreadsheet https://docs.google.com/spreadsheets/d/1eZOEYgts7d_-Dr-1oAbogPfzBLh6511b58pX3b59kvg && python cre.py --generate_embeddings && python cre.py --zap_in --cheatsheets_in --github_tools_in --capec_in --owasp_secure_headers_in --pci_dss_4_in --juiceshop_in && python cre.py --generate_embeddings

import-neo4j:
[ -d "./venv" ] && . ./venv/bin/activate
export FLASK_APP=$(CURDIR)/cre.py && python cre.py --populate_neo4j_db

all: clean lint test dev dev-run
3 changes: 2 additions & 1 deletion Procfile
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
web: gunicorn cre:app --log-file=-
web: gunicorn cre:app --log-file=-
worker: FLASK_APP=`pwd`/cre.py python cre.py --start_worker
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,18 @@ To add a remote spreadsheet to your local database you can run
<pre>python cre.py --add --from_spreadsheet <google sheets url></pre>

To run the web application for development you can run
<pre>make dev-run</pre>
<pre>
$ make start-containers
$ make start-worker

# in a separate shell
$ make dev-flask
</pre>

Alternatively, you can use the dockerfile with
<pre>make docker && make docker-run</pre>

Some features like Gap Analysis require a neo4j DB running you can start this with
Some features, like Gap Analysis, require a running Neo4j DB; you can start one with
<pre>make docker-neo4j</pre>
Environment variables for the app to connect to the Neo4j DB (with defaults):
- NEO4J_URL (neo4j//neo4j:password@localhost:7687)
Expand Down
13 changes: 4 additions & 9 deletions application/cmd/cre_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from application.utils.external_project_parsers import (
capec_parser,
cwe,
ccmv3,
ccmv4,
cheatsheets_parser,
misc_tools_parser,
Expand Down Expand Up @@ -375,14 +374,6 @@ def run(args: argparse.Namespace) -> None: # pragma: no cover
if args.export:
cache = db_connect(args.cache_file)
cache.export(args.export)
if args.csa_ccm_v3_in:
ccmv3.parse_ccm(
ccmFile=sheet_utils.readSpreadsheet(
alias="",
url="https://docs.google.com/spreadsheets/d/1b5i8OV919aiqW2KcYWOQvkLorL1bRPqjthJxLH0QpD8",
),
cache=db_connect(args.cache_file),
)
if args.csa_ccm_v4_in:
ccmv4.parse_ccm(
ccmFile=sheet_utils.readSpreadsheet(
Expand Down Expand Up @@ -428,6 +419,10 @@ def run(args: argparse.Namespace) -> None: # pragma: no cover
owasp_metadata_to_cre(args.owasp_proj_meta)
if args.populate_neo4j_db:
populate_neo4j_db(args.cache_file)
if args.start_worker:
from application.worker import start_worker

start_worker(args.cache_file)


def db_connect(path: str):
Expand Down
101 changes: 68 additions & 33 deletions application/database/db.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from flask import json as flask_json
import json
import redis
from neomodel import (
config,
StructuredNode,
Expand All @@ -16,7 +19,6 @@
from collections import Counter
from itertools import permutations
from typing import Any, Dict, List, Optional, Tuple, cast

import networkx as nx
import yaml
from application.defs import cre_defs
Expand All @@ -26,6 +28,8 @@
import uuid

from application.utils.gap_analysis import get_path_score
from application.utils.hash import make_array_hash


from .. import sqla # type: ignore

Expand Down Expand Up @@ -294,8 +298,7 @@ def __init__(sel):
raise ValueError("NEO_DB is a singleton, please call instance() instead")

@classmethod
def populate_DB(self, session) -> nx.Graph:
graph = nx.DiGraph()
def populate_DB(self, session):
for il in session.query(InternalLinks).all():
group = session.query(CRE).filter(CRE.id == il.group).first()
if not group:
Expand All @@ -319,7 +322,6 @@ def populate_DB(self, session) -> nx.Graph:
self.add_cre(cre)

self.link_CRE_to_Node(lnk.cre, lnk.node, lnk.type)
return graph

@classmethod
def add_cre(self, dbcre: CRE):
Expand Down Expand Up @@ -423,31 +425,38 @@ def link_CRE_to_Node(self, CRE_id, node_id, link_type):
def gap_analysis(self, name_1, name_2):
base_standard = NeoStandard.nodes.filter(name=name_1)
denylist = ["Cross-cutting concerns"]
from pprint import pprint
from datetime import datetime

t1 = datetime.now()
path_records_all, _ = db.cypher_query(
"""
OPTIONAL MATCH (BaseStandard:NeoStandard {name: $name1})
OPTIONAL MATCH (CompareStandard:NeoStandard {name: $name2})
OPTIONAL MATCH p = shortestPath((BaseStandard)-[*..20]-(CompareStandard))
OPTIONAL MATCH p = allShortestPaths((BaseStandard)-[*..20]-(CompareStandard))
WITH p
WHERE length(p) > 1 AND ALL(n in NODES(p) WHERE (n:NeoCRE or n = BaseStandard or n = CompareStandard) AND NOT n.name in $denylist)
RETURN p
""",
{"name1": name_1, "name2": name_2, "denylist": denylist},
resolve_objects=True,
)

t2 = datetime.now()
pprint(f"path records all took {t2-t1}")
pprint(path_records_all.__len__())
path_records, _ = db.cypher_query(
"""
OPTIONAL MATCH (BaseStandard:NeoStandard {name: $name1})
OPTIONAL MATCH (CompareStandard:NeoStandard {name: $name2})
OPTIONAL MATCH p = shortestPath((BaseStandard)-[:(LINKED_TO|CONTAINS)*..20]-(CompareStandard))
OPTIONAL MATCH p = allShortestPaths((BaseStandard)-[:(LINKED_TO|CONTAINS)*..20]-(CompareStandard))
WITH p
WHERE length(p) > 1 AND ALL(n in NODES(p) WHERE (n:NeoCRE or n = BaseStandard or n = CompareStandard) AND NOT n.name in $denylist)
RETURN p
""",
{"name1": name_1, "name2": name_2, "denylist": denylist},
resolve_objects=True,
)
t3 = datetime.now()

def format_segment(seg: StructuredRel, nodes):
relation_map = {
Expand Down Expand Up @@ -476,16 +485,24 @@ def format_path_record(rec):
"path": [format_segment(seg, rec.nodes) for seg in rec.relationships],
}

pprint(
f"path records all took {t2-t1} path records took {t3 - t2}, total: {t3 - t1}"
)
return [NEO_DB.parse_node(rec) for rec in base_standard], [
format_path_record(rec[0]) for rec in (path_records + path_records_all)
]

@classmethod
def standards(self) -> List[str]:
tools = NeoTool.nodes.all()
standards = NeoStandard.nodes.all()

return list(set([x.name for x in tools] + [x.name for x in standards]))
tools = []
for x in db.cypher_query("""MATCH (n:NeoTool) RETURN DISTINCT n.name""")[0]:
tools.extend(x)
standards = []
for x in db.cypher_query("""MATCH (n:NeoStandard) RETURN DISTINCT n.name""")[
0
]: # 0 is the results, 1 is the "n.name" param
standards.extend(x)
return list(set([x for x in tools] + [x for x in standards]))

@staticmethod
def parse_node(node: NeoDocument) -> cre_defs.Document:
Expand Down Expand Up @@ -1399,28 +1416,6 @@ def find_path_between_nodes(

return res

def gap_analysis(self, node_names: List[str]):
base_standard, paths = self.neo_db.gap_analysis(node_names[0], node_names[1])
if base_standard is None:
return None
grouped_paths = {}
for node in base_standard:
key = node.id
if key not in grouped_paths:
grouped_paths[key] = {"start": node, "paths": {}}

for path in paths:
key = path["start"].id
end_key = path["end"].id
path["score"] = get_path_score(path)
del path["start"]
if end_key in grouped_paths[key]["paths"]:
if grouped_paths[key]["paths"][end_key]["score"] > path["score"]:
grouped_paths[key]["paths"][end_key] = path
else:
grouped_paths[key]["paths"][end_key] = path
return grouped_paths

def standards(self) -> List[str]:
return self.neo_db.standards()

Expand Down Expand Up @@ -1767,3 +1762,43 @@ def dbCREfromCRE(cre: cre_defs.CRE) -> CRE:
external_id=cre.id,
tags=",".join(tags),
)


def gap_analysis(
    neo_db: NEO_DB,
    node_names: List[str],
    store_in_cache: bool = False,
    cache_key: str = "",
):
    """Run a gap analysis between the first two standards in *node_names*.

    Fetches base-standard nodes and candidate paths from Neo4j, then groups
    the paths by the id of their starting node, keeping only the
    lowest-scoring path per end node.

    When *store_in_cache* is True the grouped result is written to Redis
    (under *cache_key*, or a hash derived from *node_names* when empty) and
    an empty mapping is returned instead — this keeps a potentially huge
    object out of the caller's memory when invoked from a worker.

    Returns None when the base standard cannot be found, otherwise a tuple
    of (node_names, grouped_paths) where grouped_paths maps start-node id
    to {"start": node, "paths": {end_id: path}}.
    """
    base_standard, paths = neo_db.gap_analysis(node_names[0], node_names[1])
    if base_standard is None:
        return None

    grouped_paths = {}
    for node in base_standard:
        if node.id not in grouped_paths:
            grouped_paths[node.id] = {"start": node, "paths": {}}

    for path in paths:
        start_id = path["start"].id
        end_id = path["end"].id
        path["score"] = get_path_score(path)
        del path["start"]
        # Keep only the best (lowest) scoring path per end node.
        current = grouped_paths[start_id]["paths"].get(end_id)
        if current is None or current["score"] > path["score"]:
            grouped_paths[start_id]["paths"][end_id] = path

    if store_in_cache:
        # Lightweight memory option: when called via a worker, stash the
        # (potentially huge) result in Redis instead of returning it, so
        # neither this process nor the caller holds a duplicate in memory.
        conn = redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379"))
        if cache_key == "":
            cache_key = make_array_hash(node_names)
        conn.set(cache_key, flask_json.dumps({"result": grouped_paths}))
        return (node_names, {})

    return (node_names, grouped_paths)
57 changes: 54 additions & 3 deletions application/frontend/src/pages/GapAnalysis/GapAnalysis.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import axios from 'axios';
import React, { useEffect, useState } from 'react';
import React, { useEffect, useRef, useState } from 'react';
import { useLocation } from 'react-router-dom';
import {
Accordion,
Expand Down Expand Up @@ -125,12 +125,14 @@ export const GapAnalysis = () => {
const [CompareStandard, setCompareStandard] = useState<string | undefined>(
searchParams.get('compare') ?? ''
);
const [gaJob, setgaJob] = useState<string>('');
const [gapAnalysis, setGapAnalysis] = useState<Record<string, GapAnalysisPathStart>>();
const [activeIndex, SetActiveIndex] = useState<string>();
const [loadingStandards, setLoadingStandards] = useState<boolean>(false);
const [loadingGA, setLoadingGA] = useState<boolean>(false);
const [error, setError] = useState<string | null | object>(null);
const { apiUrl } = useEnvironment();
const timerIdRef = useRef<NodeJS.Timer>();

const GetStrongPathsCount = (paths) =>
Math.max(
Expand All @@ -156,13 +158,62 @@ export const GapAnalysis = () => {
});
}, [setStandardOptions, setLoadingStandards, setError]);

useEffect(() => {
console.log('gajob changed, polling');
const pollingCallback = () => {
const fetchData = async () => {
const result = await axios.get(`${apiUrl}/ma_job_results?id=` + gaJob, {
headers: {
'Cache-Control': 'no-cache',
Pragma: 'no-cache',
Expires: '0',
},
});
if (result.data.result) {
setLoadingGA(false);
setGapAnalysis(result.data.result);
setgaJob('');
}
};
if (!gaJob) return;
fetchData().catch((e) => {
setLoadingGA(false);
setError(e.response.data.message ?? e.message);
});
};

const startPolling = () => {
// Polling every 10 seconds
timerIdRef.current = setInterval(pollingCallback, 10000);
};
const stopPolling = () => {
clearInterval(timerIdRef.current);
};

if (gaJob) {
console.log('started polling');
startPolling();
} else {
console.log('stoped polling');
stopPolling();
}

return () => {
stopPolling();
};
}, [gaJob]);

useEffect(() => {
const fetchData = async () => {
const result = await axios.get(
`${apiUrl}/map_analysis?standard=${BaseStandard}&standard=${CompareStandard}`
);
setLoadingGA(false);
setGapAnalysis(result.data);
if (result.data.result) {
setLoadingGA(false);
setGapAnalysis(result.data.result);
} else if (result.data.job_id) {
setgaJob(result.data.job_id);
}
};

if (!BaseStandard || !CompareStandard || BaseStandard === CompareStandard) return;
Expand Down
Loading

0 comments on commit f508bca

Please sign in to comment.