diff --git a/README.md b/README.md index 5c5445e..79e78fa 100644 --- a/README.md +++ b/README.md @@ -1 +1,24 @@ -# labs \ No newline at end of file +# DataStax Labs + +This repository contains documentation and supporting components for some +experimental and preview releases made available through the DataStax Labs +program. + +## Components + +### DataStax Graph Labs + +Refer to the [dsgraph](./dsgraph) directory. + +## Support + +The code, examples, and snippets provided in this repository are not intended +for production use and are not "Supported Software" under any DataStax +subscriptions or other agreements. + +## License + +Refer to the [DataStax Labs Terms][1] for full details on the license and terms +related to the use of the software described here. + +[1]: https://www.datastax.com/terms/datastax-labs-terms diff --git a/dsgraph/README.md b/dsgraph/README.md new file mode 100644 index 0000000..f17f181 --- /dev/null +++ b/dsgraph/README.md @@ -0,0 +1,213 @@ +# DataStax Graph Labs + +These are the instructions for installing and using the DataStax Graph +Labs preview. + +## What's New and Documentation + +The new, experimental graph engine included in this Labs package +allows for users to work seamlessly across Cassandra and Graph APIs +while accessing the data stored in Cassandra. This is possible because +the experimental graph engine has been embedded into Cassandra's data +model metadata. DataStax is calling this novel approach to graph data +management, DataStax's OneModel. + +Because this is an experimental preview of a new graph engine in +DataStax Graph, a subset of documentation is available that describes +the new concepts included in DataStax Graph. + +Please review the ./graph-docs/README.html file for an overview of the +features, behaviors, and functionality of the experimental and new +graph engine. + +### DataStax Studio Sample Notebooks + +In addition to the documentation included in the ./graph-docs +directory, DataStax is providing a set of Getting Started Studio +Notebooks to help users understand the key concepts of the +experimental graph engine build sample applications quickly. + +Three different Getting Started notebooks are provided in this +package: + +* DataStax Graph Gremlin - start here for a pure Gremlin experience +* DataStax Graph CQL as Graph - start here to use CQL to create a new graph. +* DataStax Graph CQL Conversion to Graph - start here to see how to convert + an existing CQL keyspace into a graph. + +The Studio sample notebook can be found under the directory: +./studio-getting-started-guide-notebook + +See the instructions under the DataStax Studio section further down in +this file to use these notebooks. + +### Classic Graph Schema Migration Example + +The DataStax Graph engine available in DSE 6.7 and below is now +referred to as DataStax's Classic graph engine. If you are interested +in learning how to migrate from an existing graph built using the +Classic graph engine into a graph that's compatible with the new, +experimental graph engine included in DataStax Labs, please contact a +DataStax Services professional and a member of DataStax's Graph +Practice will be in touch to discuss how DataStax can help. + +A simple migration example is available for review in this package. 
+Please review the file ./graph-migration/README.md + +## Installation + +### Prerequisites + +* Platform: Linux or MacOS (Studio can run on Windows but DSE server cannot) +* Java: Java 8 (1.8.0_151 minimum; Java 9 or higher not supported) +* Python: 2.7.x + +### DataStax Enterprise + +DataStax Enterprise Server is provided in tarball format for this labs +preview. To install you will follow similar instructions as explained +here for the latest released DSE which is version 6.7: + + +In summary you will need to first expand the DSE tarball: + + tar xvzf dse-6.8.0*.tar.gz + +Then make the default directories and change their ownership to the +current running user: + + sudo mkdir /var/{log,lib}/{dsefs,spark,cassandra} + sudo mkdir /var/lib/spark/{rdd,worker} + sudo chown -R $USER:$GROUP /var/{log,lib}/{dsefs,spark,cassandra} + +Alternatively, if you want to use different directory locations, see +the 6.7 documentation referenced above. + +Finally you can start DSE with DataStax Graph enabled: + + cd dse-6.8.0* + bin/dse cassandra -g + +To get the most value out of DataStax Graph, DSE Search is also +recommended to be enabled. + +To start DSE with DataStax Graph and DSE Search enabled, run this +command instead: + + cd dse-6.8.0* + bin/dse cassandra -s -g + +To leverage DSEGraphFrames or any other analytical tool, like Gremlin +OLAP, run this command instead: + + cd dse-6.8.0* + bin/dse cassandra -s -g -k + +After a few minutes, DSE should be running. You can confirm this with +a nodetool status command: + + bin/nodetool status + +In the status message you should see confirmation that the single node +is running and that Graph mode is enabled: + + DC: Graph Workload: Cassandra Graph: yes + ======================================================= + Status=Up/Down + |/ State=Normal/Leaving/Joining/Moving + -- Address Load Owns Token Rack Health [0,1] + UN 127.0.0.1 103.21 KiB ? -5463528168999689403 rack1 0.50 + +### DataStax Studio + +DataStax Studio is provided in tarball format for this labs preview. To +install you will follow similar instructions as explained here for the +latest released Studio which is version 6.7: + + +Important: If an earlier version of DataStax Studio is already +installed, back up the user data directory before you install the labs +version. The user data directory is normally located under +`user_home_directory/.datastax_studio` (see the docs linked above for +details). + +In summary you will need to first expand the Studio tarball: + + tar xvzf datastax-studio-6.8.0*.tar.gz + +Next start the Studio server: + + cd datastax-studio-6.8.0* + bin/server.sh + +Once Studio is running it should report a status message similar to: + + Studio is now running at: http://127.0.0.1:9091 + +Finally, to access DataStax Studio, switch to your web browser and +access this URL: + +For DSE running on localhost, the URI will be `localhost`. When DSE is +running on a different machine, the URI will be the hostname or IP +address for the remote machine. + +### Java Driver + +To use the Java driver with your application, follow these instructions. + +Unpack the `dse-java-driver.tar.gz` file. 
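For example, using the same approach as for the other tarballs in this package, from the directory containing the download:

    tar xvzf dse-java-driver.tar.gz
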
You should see the following +contents under the newly-created `dse-java-driver` folder: + + # Core DSE driver + dse-java-driver-core-1.8.1.20190510-LABS.jar + dse-java-driver-core-1.8.1.20190510-LABS-shaded.jar # shaded deps + dse-java-driver-core-1.8.1.20190510-LABS.pom + # Graph fluent API + dse-java-driver-graph-1.8.1.20190510-LABS.jar + dse-java-driver-graph-1.8.1.20190510-LABS.pom + # Object mapper + dse-java-driver-mapping-1.8.1.20190510-LABS.jar + dse-java-driver-mapping-1.8.1.20190510-LABS.pom + +You can then reference the above jars directly by placing them in your +application's classpath. + +Alternatively, you can install the above jars and poms in your local +Maven repository, so that they can be declared as regular dependencies +in your Maven or Gradle build. Refer to Maven's [Guide to installing +3rd party JARs] for further information. + +[Guide to installing 3rd party JARs]:https://maven.apache.org/guides/mini/guide-3rd-party-jars-local.html + +### Python Driver + +To use the Python driver with your application, follow these instructions. + +A virtual environment is recommended, especially so that these Labs +drivers don't mix or conflict with any released drivers on the same +system. For example to use pyenv with the virtualenv plugin you would +follow these steps to create a virtual environment: + + pyenv virtualenv 2.7.15 labs + pyenv activate labs + +Next install the main Python driver and the DataStax Graph +enhancements driver: + + pip install --upgrade pip + pip install dse-driver-2.8.1.20190509+labs.tar.gz + pip install dse-graph-1.6.0.20190509+labs.tar.gz + +If the install completes without any errors, you can confirm that both +drivers are loadable: + + python -c 'import dse_graph; print dse_graph.__version__' + python -c 'import dse; print dse.__version__' + +## Next Steps + +We want to hear your feedback! Go to the Labs section of the new +[DataStax Community forums](https://community.datastax.com/spaces/11/index.html). + +You can also reach out on the Labs forums for any help needed with +these installation steps. diff --git a/dsgraph/graph-docs/CQLGraphExtensions.md b/dsgraph/graph-docs/CQLGraphExtensions.md new file mode 100644 index 0000000..79fb55d --- /dev/null +++ b/dsgraph/graph-docs/CQLGraphExtensions.md @@ -0,0 +1,196 @@ +# CQL Grammar to specify Graph Metadata on Keyspaces/Tables +CQL extensions have been added to allow creation and maintenance of graph keyspaces directly via CQL. + +## How to specify that a Keyspace should be a Graph? +In order to treat a keyspace as a graph, one needs to specify **graph_engine** when creating/altering a keyspace: +``` +ALTER KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1} AND graph_engine = 'Native'; +``` +## Which Graph Engines are available? +The only engine type is **Native**. There's also **Classic** engine, but this can't be specified on a keyspace, because a **Classic** graph consists of multiple keyspaces and needs to be created via the traditional System API. 
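Since **graph_engine** can be supplied at creation time as well as on an ALTER, a Native graph keyspace can equally be declared from the start. As a sketch, the `test` keyspace used above could have been created this way (mirroring the ALTER statement shown earlier):
```
CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1} AND graph_engine = 'Native';
```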
+ +`system_schema.keyspaces` will also properly reflect the graph engine: + +``` +cqlsh> select * from system_schema.keyspaces; + + keyspace_name | durable_writes | graph_engine | replication +--------------------+----------------+--------------+------------------------------------------------------------------------------------- + test | True | Native | {'class': 'org.apache.cassandra.locator.SimpleStrategy', 'replication_factor': '1'} + system_auth | True | null | {'class': 'org.apache.cassandra.locator.SimpleStrategy', 'replication_factor': '1'} + ... + system | True | null | {'class': 'org.apache.cassandra.locator.LocalStrategy'} + dse_perf | True | null | {'class': 'org.apache.cassandra.locator.SimpleStrategy', 'replication_factor': '1'} + system_traces | True | null | {'class': 'org.apache.cassandra.locator.SimpleStrategy', 'replication_factor': '2'} + dse_security | True | null | {'class': 'org.apache.cassandra.locator.SimpleStrategy', 'replication_factor': '1'} + +``` + +It is important to note that if a keyspace does not have **graph_engine** set, it won't be recognized as a graph. + +## What's the difference between the Graph engines? + +**Native** requires a keyspace and tables to specify how the Graph schema looks like. **Native** will keep all data of the graph in those tables. + +--- + +## How to specify that a table should be a Vertex Label? +This can be achieved by specifying `VERTEX LABEL ` when creating/altering a table. +If no name is specified, then the label will be equal to the table name. + +``` +CREATE TABLE test.person ( + firstname text, + lastname text, + age int, + jobtitle text, + phone text, + PRIMARY KEY ((firstname, lastname), age) +) WITH CLUSTERING ORDER BY (age ASC) AND VERTEX LABEL person_label; +``` + +Another vertex label `software_label` is being created, which will be used in later examples. +``` +CREATE TABLE test.software ( + software_name text, + version_info text, + software_age int, + description text, + PRIMARY KEY ((software_name, version_info), software_age) +) WITH CLUSTERING ORDER BY (software_age ASC) AND VERTEX LABEL software_label; + +``` + +`system_schema.vertices` keeps track of all existing vertex labels and their keyspace/table names. +``` +cqlsh> select * from system_schema.vertices; + + keyspace_name | table_name | label_name +---------------+------------+-------------- + test | person | person_label + test | software | software_label +``` + +--- +**NOTE** + +A vertex label name must be unique within a keyspace. Also one table can only have one graph label definition. + +--- + +## How to specify an Edge Label and what are the schema layout requirements? + +This can be done through `EDGE LABEL FROM vLabelOne(...) TO vLabelTwo(...)`. If no name is specified, then the label will be equal to the table name. +The below example shows how to create the connection `person-created->software`: +``` +CREATE TABLE test."personCreatedSoftware" ( + person_firstname text, + person_lastname text, + person_age int, + sw_name text, + sw_version_info text, + sw_age int, + creation_date date, + PRIMARY KEY ((person_firstname, person_lastname), person_age, sw_name, sw_version_info, sw_age) +) WITH CLUSTERING ORDER BY (person_age ASC, sw_name ASC, sw_version_info ASC, sw_age ASC) + AND EDGE LABEL created + FROM person_label((person_firstname, person_lastname), person_age) + TO software_label((sw_name, sw_version_info), sw_age); +``` + +--- +**NOTE** + +The v1-edge->v2 triplet must be unique within a keyspace. 
Also one table can only have one graph label definition. + +--- + +The three important things to look out for when creating an edge label table is: + +* the `PRIMARY KEY` definition +* the `FROM` and `TO` mapping definitions. Also `FROM` and `TO` must match existing vertex labels. +* type definitions of all mapping columns need to match the types defined in the vertex tables. + +Any mistakes there will be rejected at creation time of the edge label definition. + +##### PRIMARY KEY definition for Edge Label Tables +The partition keys of `person_label` also need to be partition keys in the edge table definition, which are in this case `person_firstname, person_lastname`. Clustering columns of `person_label` (`person_age`) and partition keys/clustering columns of `software_label` will end up all being clustering columns in the new edge table. +The final `PRIMARY KEY` definition will therefore be: +`PRIMARY KEY ((person_firstname, person_lastname), person_age, sw_name, sw_version_info, sw_age)` + +##### FROM/TO mapping definitions +Notice how the definitions in the `FROM` and `TO` part need to match the PK definitions of the vertex tables `person` and `software`. +Since the `FROM` part is for the `person` table, it needs to be `FROM person_label((person_firstname, person_lastname), person_age)`. +The `TO` part is for the `software` table and so that needs to be `TO software_label((sw_name, sw_version_info), sw_age)`. + +`system_schema.edges` keeps track of all existing edge labels and their keyspace/table names. In addition, it also keeps track of the **mapping** columns that are being used in order to connect a `person_label` to a `software_label`. +``` +cqlsh> select * from system_schema.edges ; + + keyspace_name | table_name | label_name | from_clustering_columns | from_partition_key_columns | from_table | to_clustering_columns | to_partition_key_columns | to_table +---------------+-----------------------+------------+-------------------------+-----------------------------------------+------------+-----------------------+--------------------------------+---------- + test | personCreatedSoftware | created | ['person_age'] | ['person_firstname', 'person_lastname'] | person | ['sw_age'] | ['sw_name', 'sw_version_info'] | software + +``` + +## How to customize Edge Table partitioning? + +Let's say one wants to partition an edge table by `X` and cluster it by `Y`. This can be easily achieved by specifying `X` and `Y` in the PK definitions in addition to the other mapping columns. + +``` +CREATE TABLE test.person__created__software ( + person_firstname text, + person_lastname text, + "X" int, + person_age int, + sw_name text, + sw_version_info text, + sw_age int, + "Y" int, + creation_date date, + PRIMARY KEY ((person_firstname, person_lastname, "X"), person_age, sw_name, sw_version_info, sw_age, "Y") +) WITH CLUSTERING ORDER BY (person_age ASC, sw_name ASC, sw_version_info ASC, sw_age ASC, "Y" ASC) + AND EDGE LABEL created + FROM person((person_firstname, person_lastname), person_age) + TO software((sw_name, sw_version_info), sw_age); +``` + +Running a traversal without specifying the additional partition key `X` will result in: +``` +g.V().outE("created") +One or more indexes are required to execute the traversal: g.V().outE("created") +Failed step: DseVertexStep(__.outE().hasLabel("created")) +CQL execution: No table or view could satisfy the query 'SELECT * FROM test.person__created__software WHERE person_firstname = ? AND person_lastname = ? AND person_age = ?' 
+The output of 'schema.indexFor().analyze()' suggests the following indexes could be created to allow execution: + +schema.edgeLabel('created').from('person').to('software').materializedView('person__created__software_by_firstname_lastname_age').ifNotExists().partitionBy('person_firstname').partitionBy('person_lastname').partitionBy('person_age').clusterBy('X', Asc).clusterBy('sw_name', Asc).clusterBy('sw_version_info', Asc).clusterBy('sw_age', Asc).clusterBy('Y', Asc).create() +``` +So in order for the traversal to succeed, one either needs to create an index, or include `X` in the traversal as shown below: +``` +g.V().outE("created").has("X", 1) +==>e[person:hans:wurst:23#68->created:1:2#45->software:DSE:123:3#06][person:hans:wurst:23#68-created->software:DSE:123:3#06] +``` + +## How to rename a Vertex/Edge Label? + +A vertex label can be renamed as shown below: +``` +ALTER TABLE test.person RENAME VERTEX LABEL TO "personX"; +``` + +An edge label can be renamed as shown below: +``` +ALTER TABLE test."personCreatedSoftware" RENAME EDGE LABEL TO "createdX"; +``` + +## How to remove a Vertex/Edge Label? + +A vertex label can be removed as shown below: +``` +ALTER TABLE test.person WITHOUT VERTEX LABEL "personX"; +``` + +An edge label can be removed as shown below: +``` +ALTER TABLE test."personCreatedSoftware" WITHOUT EDGE LABEL "createdX"; +``` diff --git a/dsgraph/graph-docs/ClassicToNativeGraphMigration.md b/dsgraph/graph-docs/ClassicToNativeGraphMigration.md new file mode 100644 index 0000000..73c5f5f --- /dev/null +++ b/dsgraph/graph-docs/ClassicToNativeGraphMigration.md @@ -0,0 +1,229 @@ +# Migrate Classic Engine Graph to Native Engine Graph +If you are an existing DataStax Graph user and are looking to upgrade to this version of DataStax Graph, then please contact your +DataStax representative to discuss best practices and lessons learned on how to do this with the DataStax services team. + +## Schema migration +The migration tool gives users a way of converting the schema of an existing Classic Engine graph to Native Engine graph. +The generated schema can either be CQL or Gremlin. + +## Restrictions +Native Engine graphs do not support meta/multi-properties, and these schema elements will be translated into regular single cardinality properties. +Indexing, caching and TTL will also be dropped from schema. See [Deprecated features](DeprecatedFeatures.md) for more information. + +## Usage +``` +dse graph-migrate-schema -cql|-gremlin +``` + +## Example +Start DSE with Analytics enabled: +``` +bin/dse cassandra -g -s -k +``` +Create a Classic Engine graph: +``` +bin/dse gremlin-console +system.graph('classic').engine(Classic).create() +:remote config alias g classic.g +schema.propertyKey('name').Text().single().ifNotExists().create() +schema.propertyKey('age').Bigint().single().ifNotExists().create() +schema.vertexLabel('person').partitionKey('name').properties('age').ifNotExists().create() +schema.vertexLabel('software').partitionKey('name').ifNotExists().create() +schema.edgeLabel('created').connection('person', 'software').create() +:exit +``` + +Create the Native Engine schema: +``` +dse graph-migrate-schema -gremlin classic migrated +``` +Output: +``` +system.graph('migrated'). + ifNotExists(). + withReplication("{ 'class' : 'org.apache.cassandra.locator.NetworkTopologyStrategy', 'SearchGraphAnalytics': '1' }"). + andDurableWrites(true). + create() + + +:remote config alias g migrated.g + +schema.vertexLabel('software'). + ifNotExists(). + partitionBy('name', Varchar). 
+ create() +schema.vertexLabel('person'). + ifNotExists(). + partitionBy('name', Varchar). + property('age', Bigint). + create() +schema.edgeLabel('created'). + ifNotExists(). + from('person').to('software'). + clusterBy('id', Uuid, Asc). + create() +``` + +# Modeling guidance +## Multi/meta-properties +As multi and meta-properties have been removed users should consider modeling such items either as collections/UDTs or distinct elements. + +### Using a collection +Consider using a collection when searching for an entry point on the the graph for your traversal. +For instance a `list` (multi) of `Varchar`: +``` +schema.vertexLabel('person'). + ifNotExists(). + partitionBy('id', Varchar). + property('name', listOf(Text)). + create() + +schema.vertexLabel('person'). + secondaryIndex('person_2i_by_names'). + ifNotExists(). + by('name'). + create() + +g.addV('person').property('id', '1').property('name', ['alice', 'bob']) +g.V().has('name', contains('bob')) +``` + +### Using a collection of UDTs +Consider using a UDT if you need structured information. +For example a `list` (multi) of `addresse`s (meta): + +``` +schema.type('address'). + property('street1', Text). + property('street2', Text). + property('postCode', Text). + create() + +schema.vertexLabel('person'). + ifNotExists(). + partitionBy('id', Varchar). + property('address', listOf(frozen(typeOf('address')))). + create() + +schema.vertexLabel('person'). + secondaryIndex('person_2i_by_address'). + ifNotExists(). + by('address'). + indexValues(). + create() + +g.addV('person'). + property('id', '1'). + property('address', [[street1:'5b Tunn Street', + street2:'Fakenham', + postCode:'Norfolk'] as address]) +``` + +Note that UDT elements currently may only be searched for as atomic value. For instance it is not possible currently +to search for `address` by `street1` only. You must specify the full address: +``` +g.V().has('address', contains([street1:'5b Tunn Street', + street2:'Fakenham', + postCode:'Norfolk'] as address)) +``` + +### Using self edges +Use a loop edge if you need full index support. For instance, this model allows `name` edges (multi), that have a `since` property (meta): +``` +schema.vertexLabel('person'). + ifNotExists(). + partitionBy('id', Varchar). + create() + +schema.edgeLabel('has_name'). + ifNotExists(). + from('person'). + to('person'). + partitionBy(OUT, 'id'). + clusterBy('name', Text). + clusterBy(IN, 'id'). + property('since', Timestamp). + create() + +g.addV('person'). + property('id', '1').as('a'). + addE('has_name'). + from('a').to('a'). + property('name', 'bob'). + property('since', '2001-01-01T00:00:00Z' as Instant) +``` +Native Engine allows graph indexes for edges: +``` +schema.edgeLabel('has_name'). + from('person'). + to('person'). + materializedView('person__has_name__person_by_name'). + ifNotExists(). + partitionBy('name'). + clusterBy(OUT, 'id', Asc). + clusterBy(IN, 'id', Asc). + create() +``` +Which allows direct lookup of edges: +``` +g.E().hasLabel('has_name').has('name', 'bob').inV() +``` + +# Edge traversal in a Native Engine Graph +In Native Engine graphs the edge layout can be completely customized. This allows users to avoid indexes in many cases, an important +consideration for scalable graphs. + +## Native Engine Graphs use only one row per edge by default +Classic Engine allowed edge navigation in both directions by default by inserting two records per edge. +However, this did not come for free. In particular `in()` edges had the potential to cause large partitions. 
For instance: +``` +person-belongsTo->country +``` +A country may have millions of people, and the reverse records that allowed navigation from `country` to `person` cause the C* +cluster to become imbalanced due to the large numbers of edges in the same partition. + +It's also worth noting that typically OLTP traversals cannot realistically process every person in a country, and OLAP +traversals do not need the reverse record anyway. + +For these reasons Native Engine graphs are more conservative in edge record creation, creating only one record for each edge and +thus speeding up ingestion. This also allows external tools such as DSBulk to be used to insert data. + +### Indexing edges to allow navigation in different directions. +By default when creating an edge the default layout allows traversal via `out()`. For example creating an edge label +with the default layout: +``` +schema.edgeLabel('created'). + from('person'). + to('software'). + create() +``` +Traversal from `person` to `software` is possible. +``` +g.V(person).out() //Returns studio +``` +But trying to traverse in the opposite direction will throw an error. +``` +g.V(software).in() //ERROR +``` +Native Engine will tell you what index you need to create to allow this traversal in the error message. +Cut and paste the index schema statement to execute and create the index: +``` +One or more indexes are required to execute the traversal: g.V().hasLabel("software").has("name","studio").in() +Failed step: DseVertexStep(__.in()) +CQL execution: No table or view could satisfy the query 'SELECT * FROM crew.person__created__software WHERE software_name = ?' +The output of 'schema.indexFor().analyze()' suggests the following indexes could be created to allow execution: + +schema.edgeLabel('created').from('person').to('software').materializedView('person__created__software_by_software_name').ifNotExists().partitionBy(IN, 'name').clusterBy(OUT, 'name', Asc).create() + +Alternatively consider using: +g.with('ignore-unindexed') to ignore unindexed traversal. Your results may be incomplete. +g.with('allow-filtering') to allow filtering. This may have performance implications. +``` +The error message includes a suggested index that will allow you to run your queries. +``` +schema.edgeLabel('created').from('person').to('software').materializedView('person__created__software_by_software_name').ifNotExists().partitionBy(IN, 'name').clusterBy(OUT, 'name', Asc).create() + +g.V(software).in() //Returns person +``` + + diff --git a/dsgraph/graph-docs/DeprecatedFeatures.md b/dsgraph/graph-docs/DeprecatedFeatures.md new file mode 100644 index 0000000..f265a40 --- /dev/null +++ b/dsgraph/graph-docs/DeprecatedFeatures.md @@ -0,0 +1,100 @@ +# What are the deprecated Features from Classic compared to Native Engine? + +## Graph configuration + +Configuration options such as `allow_scan` / `schema_mode` / `evaluation_timeout` were completely removed. +The only traversal configuration that Native Engine supports are documented [here](TraversalOptions.md). + + +## dse.yaml settings +Classic Engine still supports **all** settings from `dse.yaml`. However, for Native Engine the only supported options are: + + * `analytic_evaluation_timeout_in_ms`: Maximum time to wait for an OLAP analytic (Spark) traversal to evaluate + * `realtime_evaluation_timeout_in_ms`: Maximum time to wait for an OLTP real-time traversal to evaluate + * `system_evaluation_timeout_in_ms`: Maximum time to wait for a graph-system request to evaluate. 
Creating/dropping a new graph is an example of a graph-system request + * `gremlin_server`: Different options that configure the Gremlin server + + +## Transactions + +Transaction handling has been changed: + + * `tx()` API calls will throw an unsupported operation exception. + * All mutations are executed once a traversal has been exhausted. There are no guarantees that this will not result in partial commits in the event of node failure. + * Mutations are no longer visible during the execution of a traversal. For instance: `g.addV('person').V()` will not return a vertex. + +Transactions are not supported in C* or DataStax Graph so we should generally move towards removing this terminology. + +## Multi/Meta-Properties + +Multi and meta-property support has been dropped. Use cases have previously included: + + * Access control: C* RLAC and RBAC are used to fulfill this use case. Unlike meta-properties, this solution will scale because CQL requests are pre-processed to check what data a user may see rather than fetching all data and filtering. + * Collections: Native Engine graphs support CQL collection types and may be queried via the new collection predicates: `contains`, `containsKey`, `containsValue`, `entryEq`. + * Time machine: This requires large amounts of filtering to achieve and does not scale outside of a small graphs. It also has the possibility of creating very large partitions. Users must explicitly create a model that takes timestamps into account. + * Entity resolution: Use regular vertices and edges to model these relationships. Use a separate vertex for contributing datasource, and use edges to link them to a resolved entity. + + +## Graph API / graph.addVertex / graph.addEdge + +The [graph API](https://docs.datastax.com/en/dse/6.0/dse-dev/datastax_enterprise/graph/reference/refGraphAPI.html) was removed. + +This means that `graph.addVertex(label, 'label_name', 'key', 'value', 'key', 'value')` / `vertex1.addEdge('edgeLabel', vertex2, [T.id, 'edge_id'], ['key', 'value'] [,...])` are not supported. + +Users must use the traversal API: `g.addV(vlabel).property('key', 'value')` / `g.addE(elabel).from(v1).to(v2).property('key', 'value')` respectively. + +In addition, elements that are returned from traversals are **reference** elements. These do not include property information outside the primary key and are **immutable**. +Users should use `.valueMap().by(unfold())` to retrieve the data they are interested in. + + +## Edge directionality + +In Classic Engine all edges were by default **bidirectional**. For performance reasons, edges are now always created in a **unidirectional** manner. + +The implication is that some traversals are not possible without additional steps. For example, neither `g.V(id).in()` nor `g.V(id).both()` are possible and would result in an error. + +To enable `.in()` or `.both()` a MV must be created on the edge table and this can be done transparently using the [Index Analyzer](IndexAnalyzer.md): `schema.indexFor(g.V(id).in()).apply()` + +--- + +**NOTE** + +There are significant advantages to using edges without a MV during data ingestion. Having a MV requires reading the old value before updating it, therefore resulting in a **read-before-write**. + +--- + +## Data types +Data types have been aligned with the java driver. All types that were supported in Classic are identical except `Duration`. + + +In Classic Engine graphs `Duration` is represented by `java.time.Duration`. 
+ +In Native Engine graphs `Duration` is represented by `com.datastax.driver.core.Duration`. + +## Query caching +In Classic there was the the option to set graph and vertex queries to be cached. This is no longer an option in Native Engine graphs. We may add caching ability in a subsequent release. + +## TTL support +TTL support via schema currently requires users set this via CQL in Native Engine graphs. This is a feature that will be added in a subsequent release. + +## External ID construction +Native Engine does not support external ID construction and IDs must be obtained directly from elements if they are to be used for lookups. + +```g.V(id)``` + +Users that wish to look up elements by ID should instead use `.inject()` and `.has()`. For example: + +Get `bob`: `g.V().has('name', eq('bob'))` + +Get `bob` and `alice`: `g.V().has('name', within(['bob', 'alice']))` + + +## DGL + +DGL is deprecated and not supported by Native Engine. Users can instead use plain CQL for data ingestion or one of the bulk loading tools (GraphFrames, DS Bulk Loader). + + +## Lambdas + +Lambdas are currently not supported in Native Engine, but they may be enabled by default in a subsequent release. + diff --git a/dsgraph/graph-docs/DseGraphFrames.md b/dsgraph/graph-docs/DseGraphFrames.md new file mode 100644 index 0000000..ce4a5dd --- /dev/null +++ b/dsgraph/graph-docs/DseGraphFrames.md @@ -0,0 +1,1217 @@ +# DataStax Graph Frames +This guide is broken down into two main sections, a [DataStax Graph Frames API](#datastax-graph-frames-api) section describing fundamental methods for managing data, and a [Best Practices Guide for Loading Data](#best-practices-guide-for-loading-data) section highlighting recommended practices. + +The DseGraphFrame package provides a Spark API for bulk operations and analytics on DataStax Graph. It is inspired by Databricks’ GraphFrame library and supports a subset of Apache TinkerPop™ Gremlin graph traversal language. It supports reading of DataStax Graph data into a GraphFrame and writing GraphFrames from any format supported by Spark into DataStax Graph. +For a review of our initial offering and more introductory examples see our [Introducing DataStax Graph Frames](https://www.datastax.com/dev/blog/dse-graph-frame) blog post. + +### DataStax Graph Frames API +The following table shows the key methods available for managing data with DataStax Graph Frames. + +|Method|Result| +|----------|:-------------| +|gf()|GraphFrame object for graph frame API usage| +|V()|DseGraphTraversal\[Vertex\] object used to start a TinkerPop vertex traversal| +|E()|DseGraphTraversal\[Edge\] object used to start a TinkerPop edge traversal| +|io()|TP IOStep to export or inport graph from external source| +|[deleteVertices()](#deletevertices), [deleteEdges()](#deleteedges)|Delete vertices and edges| +|[deleteVertexProperties()](#deletevertexproperties), [deleteEdgeProperties()](#deleteedgeproperties)|Delete property values (does not change schema)| +|[updateVertices()](#updatevertices), [updateEdges()](#updateedges)|Update or insert properties, vertices, and edges| + +The examples shown in this section build upon the vertex and edge examples shown in [SystemAndSchemaApi docs](SystemAndSchemaAPI.md). +As a reminder, here are the schemas for these elements. +```groovy +gremlin> schema.vertexLabel('person'). + ifNotExists(). + partitionBy('name', Text). + partitionBy('ssn', Text). + clusterBy('age', Int). + property('address', Text). + property('coffeePerDay', Int). 
+ create() + +gremlin> schema.vertexLabel('software'). + ifNotExists(). + partitionBy('name', Text). + clusterBy('version', Int). + clusterBy('lang', Text). + property('temp', Text). + property('static_property', Text, Static). + create() + +gremlin> schema.edgeLabel('created'). + ifNotExists(). + from('person').to('software'). + property('weight', Double). + create() +``` + +#### deleteVertices +Vertices for a given label can easily be deleted using `deleteVertices` with the vertex label to delete. +```scala +def deleteVertices(label: String): Unit +``` + +###### Example +```scala +scala> g.V().show(false) ++------------------------------+--------+-----+-------+------+-----------+----+---------------+-----+------------------+------------+ +|id |~label |name |version|lang |ssn |age |static_property|temp |address |coffeePerDay| ++------------------------------+--------+-----+-------+------+-----------+----+---------------+-----+------------------+------------+ +|software:timer:2.0:groovy#00 |software|timer|2.0 |groovy|null |null|beta |100 |null |null | +|software:chat:1.0:scala#20 |software|chat |1.0 |scala |null |null|united states |mambo|null |null | +|person:elmo:123-45-6789:4#33 |person |elmo |null |null |123-45-6789|4 |null |null |123 sesame street|1000 | +|person:rocco:111-11-1111:21#11|person |rocco|null |null |111-11-1111|21 |null |null |123 sesame street|100 | ++------------------------------+--------+-----+-------+------+-----------+----+---------------+-----+------------------+------------+ + +scala> g.deleteVertices("software") + +scala> g.V().show(false) ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +|id |~label|name |version|lang|ssn |age|static_property|temp|address |coffeePerDay| ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +|person:elmo:123-45-6789:4#33 |person|elmo |null |null|123-45-6789|4 |null |null|123 sesame street|1000 | +|person:rocco:111-11-1111:21#11|person|rocco|null |null|111-11-1111|21 |null |null|123 sesame street|100 | ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +``` + + +#### deleteEdges +This process is very similar to [updateEdges](#updateedges), but with an extra step between 3 and 4 that purges all the DataFrame columns except the newly add primary property key column names that were added in step 2. +Another key difference with step 4 is that we use `df.rdd.deleteFromCassandra` for carrying out the delete operation. + +```scala +def deleteEdges(df: DataFrame, cache: Boolean = true): Unit +``` +###### Example +Continuing from the previous example, deleting the `created` edge between `person` and `software` vertex is accomplished like this +```scala +scala> g.E.show(false) ++------------------------------+--------------------------+-------+------+ +|src |dst |~label |weight| ++------------------------------+--------------------------+-------+------+ +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20|created|null | ++------------------------------+--------------------------+-------+------+ + + +scala> g.deleteEdges(edgeTarget) + +scala> g.E.show(false) ++---+---+------+------+ +|src|dst|~label|weight| ++---+---+------+------+ ++---+---+------+------+ +``` + +Where the `edgeTarget` is defined like this. 
+```scala +scala> edgeTarget.show(false) ++------------------------------+--------------------------+-------+ +|src |dst |~label | ++------------------------------+--------------------------+-------+ +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20|created| ++------------------------------+--------------------------+-------+ +``` + +#### deleteVertexProperties +Deleting vertex properties can be done by providing a data frame with IDs of the vertices to target, a list of properties to delete, and optional parameters for isolating vertex labels and caching. +The API looks like this +```scala +def deleteVertexProperties(df: DataFrame, properties: Seq[String], labels: Seq[String] = Seq.empty, cache: Boolean = true): Unit +``` +###### Example +Say we want to delete the `coffeePerDay` property of the `person` vertex with name "rocco" +```scala +scala> g.V().show(false) ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +|id |~label|name |version|lang|ssn |age|static_property|temp|address |coffeePerDay| ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +|person:rocco:111-11-1111:21#11|person|rocco|null |null|111-11-1111|21 |null |null|123 sesame street|100 | ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +``` + +We could delete this vertex property like so +```scala +scala> g.deleteVertexProperties(target, Seq("coffeePerDay")) + +scala> g.V().show(false) ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +|id |~label|name |version|lang|ssn |age|static_property|temp|address |coffeePerDay| ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +|person:rocco:111-11-1111:21#11|person|rocco|null |null|111-11-1111|21 |null |null|123 sesame street|null | ++------------------------------+------+-----+-------+----+-----------+---+---------------+----+------------------+------------+ +``` + +Where the supplied data frame could take any of the following forms +```scala +scala> target.show(false) ++------------------------------+------+-----+-----------+---+ +|id |~label|name |ssn |age| ++------------------------------+------+-----+-----------+---+ +|person:rocco:111-11-1111:21#11|person|rocco|111-11-1111|21 | ++------------------------------+------+-----+-----------+---+ +``` + +```scala +scala> target.drop("id").show(false) ++------+-----+-----------+---+ +|~label|name |ssn |age| ++------+-----+-----------+---+ +|person|rocco|111-11-1111|21 | ++------+-----+-----------+---+ +``` + +```scala +scala> target.drop("~label").show(false) ++------------------------------+-----+-----------+---+ +|id |name |ssn |age| ++------------------------------+-----+-----------+---+ +|person:rocco:111-11-1111:21#11|rocco|111-11-1111|21 | ++------------------------------+-----+-----------+---+ +``` + +#### deleteEdgeProperties +The process is very similar to [updateEdges](#updateedges), but with an extra step between 3 and 4 that adds the supplied list of properties to the DataFrame, these are the properties to be deleted. +Another key difference with step 4 is that we omit the `.options(WriteConf.IgnoreNullsParam.sqlOption(true))` option which allows us to overwrite existing values with null properties. 
+```scala +def deleteEdgeProperties(df: DataFrame, properties: String*): Unit +``` + +###### Example +Given the following edge +```scala +scala> g.E().show(false) ++------------------------------+--------------------------+-------+------+ +|src |dst |~label |weight| ++------------------------------+--------------------------+-------+------+ +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20|created|100.0 | ++------------------------------+--------------------------+-------+------+ +``` + +With the following data frame targeting this edge +```scala +scala> edgeTarget.show(false) ++------------------------------+--------------------------+-------+ +|src |dst |~label | ++------------------------------+--------------------------+-------+ +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20|created| ++------------------------------+--------------------------+-------+ +``` + +Deleting the edge properties is as simple as this +```scala +scala> g.deleteEdgeProperties(edgeTarget, "weight") + +scala> g.E().show(false) ++------------------------------+--------------------------+-------+------+ +|src |dst |~label |weight| ++------------------------------+--------------------------+-------+------+ +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20|created|null | ++------------------------------+--------------------------+-------+------+ +``` + +#### updateVertices +The `updateVertices` method comes in two flavors, updating multiple vertex labels or updating a single vertex label (new API). + +##### Updating multiple vertex labels +The API for updating multiple vertex labels looks like this +```scala +def updateVertices(df: DataFrame, labels: Seq[String] = Seq.empty, cache: Boolean = true): Unit +``` +Where the `df` is the data frame with vertex ID and columns to be updated. +The `labels` is used to group vertices within the same ID format, empty (means all) +The `cache` param indicates whether to cache the data frame before processing, it is set to true by default for consistence update and performance. + +Here is a simple example updating the person vertex table shown in the [SystemAndSchemaApi docs](SystemAndSchemaAPI.md) +As a reminder, the schema for the person and software vertex looks like this +```groovy +gremlin> schema.vertexLabel('person') + .ifNotExists() + .partitionBy('name', Text) + .partitionBy('ssn', Text) + .clusterBy('age', Int) + .property('address', Text) + .property('coffeePerDay', Int) + .create() + +gremlin> schema.vertexLabel('software') + .ifNotExists() + .partitionBy('name', Text) + .clusterBy('version', Int) + .clusterBy('lang', Text) + .property('temp', Text) + .property('static_property', Text, Static) + .create() +``` +Remember, DataStax Graph Frames represents a Graph as two virtual tables: a Vertex DataFrame and an Edge DataFrame. +So in the running example involving a `person` and `software` vertex, we will see a single Vertex DataFrame ecapsulating data for both. 
+```scala +scala> g.V().printSchema +root + |-- id: string (nullable = true) + |-- ~label: string (nullable = false) + |-- name: string (nullable = false) + |-- version: string (nullable = true) + |-- lang: string (nullable = true) + |-- ssn: string (nullable = true) + |-- age: integer (nullable = true) + |-- static_property: string (nullable = true) + |-- temp: string (nullable = true) + |-- address: string (nullable = true) + |-- coffeePerDay: integer (nullable = true) +``` + +Now suppose we want to update or insert vertex data for both `person` and `software` vertices, we would need to construct a data frame that looks like this. +```scala +scala> multiVertexLabelDF.show ++-----+--------+-----------+----+-----------------+------------+-------+-----+---------------+-----+ +| name| ~label| ssn| age| address|coffeePerDay|version| lang|static_property| temp| ++-----+--------+-----------+----+-----------------+------------+-------+-----+---------------+-----+ +|rocco| person|222-22-2222| 20|3975 Freedom Blvd| 2| null| null| null| null| +| chat|software| null|null| null| null| 1.0|scala| united states|mambo| ++-----+--------+-----------+----+-----------------+------------+-------+-----+---------------+-----+ + +``` +Note that we have a `~label` column and the necessary ID columns for both `person` (i.e. name, ssn, age) and `software` (i.e. name, version, lang) vertices. +Once the data frame is constructed properly, updating the vertices is straight forward. +```scala +scala> g.updateVertices(multiVertexLabelDF) +``` + +##### Updating a single vertex label (new API) +The API for updating a single vertex label is bit simpler. In this case the API looks like this. +```scala +def updateVertices(vertexLabel: String, df: DataFrame): Unit +``` +Where the `vertexLabel` specifies the single vertex label of intertest, and the data frame consists of the vertex IDs and +any additional columns targeted for updates. In the `person` vertex example the data frame would look like this +```scala +scala> personDF.show ++----+-----------+---+------------------+------------+ +|name| ssn|age| address|coffeePerDay| ++----+-----------+---+------------------+------------+ +|elmo|123-45-6789| 4| 123 sesame street| 1000| ++----+-----------+---+------------------+------------+ +``` +Once again updating is simple. +```scala +scala> g.updateVertices("person", personDF) +``` + +#### updateEdges +Similar to vertex updates, `updateEdges` also comes in two flavors, updating multiple or single edge labels. + +##### Updating multiple edge labels +The process for updating edges and ultimately **mapping column names** follow this process: + +1. `updateEdges()` takes a DataFrame and using the `src`, `dst`, and `~label` columns builds a list of `EdgeLabel` objects. +2. For each edge label, prepare and filter edge IDs using `prepareAndFilterEdgeIds`. This takes a data frame that has a schema with `src`, `dst`, and `~label` columns and replaces `src` with the edge label's out-vertex primary property key column names. It does the same for `dst`, replacing this with the edge label's in-vertex primary property key column names. So in the end, the DataFrame is transformed with a schema with `~label`, out-vertex column names, and in-vertex column names in that order. +3. The `~label` column is then dropped on the returned DataFrame. +4. Then updates the DSE table using the `df.write.cassandraFormat...` path. + +The API for the multi-edge update method looks like this. 
+```scala +def updateEdges(df: DataFrame, cache: Boolean = true): Unit +``` + +Where the `df` data frame contains the edge IDs and columns to be updated. When `cache` is true (default), the data frame is explicitly cached before processing, and uncached upon completion. + +###### Example + +Let's imagine we have 2 `person` and `software` vertices each, and no edges defined. +``` +cqlsh> select * from test.person; + + name | ssn | age | address | coffeePerDay +-------+-------------+-----+--------------------+-------------- + elmo | 123-45-6789 | 4 | 123 sesame street | 1000 + rocco | 111-11-1111 | 21 | 123 sesame street | 100 + +(2 rows) +cqlsh> select * from test.software; + + name | version | lang | static_property | temp +-------+---------+--------+-----------------+------- + timer | 2.0 | groovy | beta | 100 + chat | 1.0 | scala | united states | mambo + +(2 rows) +``` + +As a reminder, from DataStax Graph Frames it looks like this +```scala +scala> g.V.show(false) ++------------------------------+--------+-----+-------+------+-----------+----+---------------+-----+------------------+------------+ +|id |~label |name |version|lang |ssn |age |static_property|temp |address |coffeePerDay| ++------------------------------+--------+-----+-------+------+-----------+----+---------------+-----+------------------+------------+ +|software:timer:2.0:groovy#00 |software|timer|2.0 |groovy|null |null|beta |100 |null |null | +|software:chat:1.0:scala#20 |software|chat |1.0 |scala |null |null|united states |mambo|null |null | +|person:elmo:123-45-6789:4#33 |person |elmo |null |null |123-45-6789|4 |null |null |123 sesame street |1000 | +|person:rocco:111-11-1111:21#11|person |rocco|null |null |111-11-1111|21 |null |null |123 sesame street |100 | ++------------------------------+--------+-----+-------+------+-----------+----+---------------+-----+------------------+------------+ + +scala> g.E.show(false) ++---+---+------+------+ +|src|dst|~label|weight| ++---+---+------+------+ ++---+---+------+------+ + +``` + +Now updating or inserting multiple edge labels is fairly straight forward, once a data frame is properly constructed. +We need a data frame that encapsulated the source and destination vertices, edge label name, and any additional properties targeted for updating. +In this example, the data frame could look like this +```scala +scala> newEdgeDataDF.show(false) ++------------------------------+----------------------------+-------+------+ +|src |dst |~label |weight| ++------------------------------+----------------------------+-------+------+ +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20 |created|10.0 | ++------------------------------+----------------------------+-------+------+ +``` +Note, this was an simplified example only showing a __single edge label__ ("created"), but a user could mix in different edge labels +by specifying the proper `~label` name and remaining fields using the principles described here. + +With this data frame, we simply pass this along to `updateEdges` like so +```scala +scala> g.updateEdges(newEdgeDataDF) +``` + +But how is this data frame constructed? Let's take deeper look at one example. In the above case we have a data frame with a +row describing a `created` edge from a `person` (rocco) to a `software` vertex (chat). +Let's start by creating a simple person data frame from scratch that contains all the fields needed for describing a `person` vertex. 
+```scala +scala> val roccoDF = Seq(("rocco","111-11-1111",21,"123 sesame street",100)).toDF("name","ssn","age","address","coffeePerDay") +roccoDF: org.apache.spark.sql.DataFrame = [name: string, ssn: string ... 3 more fields] + +scala> roccoDF.show(false) ++-----+-----------+---+------------------+------------+ +|name |ssn |age|address |coffeePerDay| ++-----+-----------+---+------------------+------------+ +|rocco|111-11-1111|21 |123 sesame street|100 | ++-----+-----------+---+------------------+------------+ +``` + +When specifying an edge we need a `src` and `dst` column, each references a DSE-generated ID representing the source and destination vertices respectively. +`idColumn()` is a helper method that allows a user to construct these `src` and `dst` fields. +To construct the `src` ID we supply the label name, and vertex primary keys. Here is an example showing the generated ID. +```scala +scala> roccoDF.select(g.idColumn(lit("person"), col("name"), col("ssn"), col("age")) as "src").show(false) ++------------------------------+ +|src | ++------------------------------+ +|person:rocco:111-11-1111:21#11| ++------------------------------+ +``` + +We do the same for the destination vertex, in this case a `software` vertex +```scala +scala> chatDF.select(g.idColumn(lit("software"), col("name"), col("version"), col("lang")) as "dst").show(false) ++--------------------------+ +|dst | ++--------------------------+ +|software:chat:1.0:scala#20| ++--------------------------+ +``` + +Now to construct the entire edge description, we simply need to add the `~label` and any additional properties targeted for updating. +```scala +scala> val srcDF = roccoDF.select(g.idColumn(lit("person"), col("name"), col("ssn"), col("age")) as "src") +srcDF: org.apache.spark.sql.DataFrame = [src: string] + +scala> val dstDF = chatDF.select(g.idColumn(lit("software"), col("name"), col("version"), col("lang")) as "dst") +dstDF: org.apache.spark.sql.DataFrame = [dst: string] + +scala> val personCreatedSoftwareDF = srcDF.crossJoin(dstDF).crossJoin(Seq(("created")).toDF("~label")).crossJoin(Seq((10.0)).toDF("weight")) +personCreatedSoftwareDF: org.apache.spark.sql.DataFrame = [src: string, dst: string ... 2 more fields] + +scala> personCreatedSoftwareDF.show(false) ++------------------------------+--------------------------+-------+------+ +|src |dst |~label |weight| ++------------------------------+--------------------------+-------+------+ +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20|created|10.0 | ++------------------------------+--------------------------+-------+------+ +``` +A user could repeat the same pattern for more edge labels and join results for constructing a single data frame with multiple edge labels targeted for updating. +```scala +scala> val newEdgeDataDF = personCreatedSoftwareDF.join(elmoCreatedTimerDF, Seq("src", "dst", "~label", "weight"), "full") +newEdgeDataDF: org.apache.spark.sql.DataFrame = [src: string, dst: string ... 
2 more fields] + +scala> newEdgeDataDF.show(false) ++------------------------------+----------------------------+-------+------+ +|src |dst |~label |weight| ++------------------------------+----------------------------+-------+------+ +|person:elmo:123-45-6789:4#33 |software:timer:2.0:groovy#00|created|100.0 | +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20 |created|10.0 | ++------------------------------+----------------------------+-------+------+ +``` +Note, using the `src`, `dst`, and `~label` fields are the key components allowing a user to mix multiple edge labels in a single update operation. +For example, if we also had a "destroyed" edge label the data frame may look like this for updating/inserting both "created" and "destroyed" edge labels. +```scala ++------------------------------+----------------------------+---------+------+ +|src |dst |~label |weight| ++------------------------------+----------------------------+---------+------+ +|person:rocco:111-11-1111:21#11|software:vlog:0.1:go#40 |destroyed|null | +|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20 |created |10.0 | ++------------------------------+----------------------------+---------+------+ +``` + + +##### Updating a single edge label (new API) +In most cases, a user is interested in updating a single edge label, and reasoning about the usage of `g.idcColumn()` may be less obvious for new users. +This new API was developed in response to these concerns and provides a simpler interface for common usage patterns. +```scala +def updateEdges(outVertexLabel: String, edgeLabel: String, inVertexLabel: String, df: DataFrame): Unit +``` +The `df` data frame paramater contains the edge ID column names. Note, these column names should match the underlying +table columns defined in DataStax Enterprise (DSE). + +###### Example + +Remember the `created` edge table is structured like this in DSE +``` +CREATE TABLE test.person__created__software ( + person_name text, + person_ssn text, + person_age int, + software_name text, + software_version text, + software_lang text, + weight double, + PRIMARY KEY ((person_name, person_ssn), person_age, software_name, software_version, software_lang) +) +``` + +Imagine we start with two data frames containing information on the vertices we wish to connect with an edge. +```scala +scala> roccoDF.show(false) ++-----+-----------+---+------------------+------------+ +|name |ssn |age|address |coffeePerDay| ++-----+-----------+---+------------------+------------+ +|rocco|111-11-1111|21 |123 sesame street|100 | ++-----+-----------+---+------------------+------------+ + +scala> chatDF.show(false) ++----+-------+-----+---------------+-----+ +|name|version|lang |static_property|temp | ++----+-------+-----+---------------+-----+ +|chat|1.0 |scala|united states |mambo| ++----+-------+-----+---------------+-----+ +``` + +We need to create single data frame containing the columns comprising the edge table's primary keys, with any additional properties we want to update. +```scala +scala> val df1 = roccoDF.select($"name" as "person_name", $"ssn" as "person_ssn", $"age" as "person_age") +df1: org.apache.spark.sql.DataFrame = [person_name: string, person_ssn: string ... 1 more field] + +scala> val df2 = chatDF.select($"name" as "software_name", $"version" as "software_version", $"lang" as "software_lang", lit(100.0) as "weight") +df2: org.apache.spark.sql.DataFrame = [software_name: string, software_version: string ... 
2 more fields]
+
+scala> val createdDF = df1.crossJoin(df2)
+createdDF: org.apache.spark.sql.DataFrame = [person_name: string, person_ssn: string ... 5 more fields]
+
+scala> createdDF.show(false)
++-----------+-----------+----------+-------------+----------------+-------------+------+
+|person_name|person_ssn |person_age|software_name|software_version|software_lang|weight|
++-----------+-----------+----------+-------------+----------------+-------------+------+
+|rocco      |111-11-1111|21        |chat         |1.0             |scala        |100.0 |
++-----------+-----------+----------+-------------+----------------+-------------+------+
+```
+
+Now that `createdDF` contains all the necessary fields, we simply provide it to `updateEdges` along with the vertex and edge label names.
+```scala
+scala> g.updateEdges("person", "created", "software", createdDF)
+
+scala> g.E.show(false)
++------------------------------+--------------------------+-------+------+
+|src                           |dst                       |~label |weight|
++------------------------------+--------------------------+-------+------+
+|person:rocco:111-11-1111:21#11|software:chat:1.0:scala#20|created|100.0 |
++------------------------------+--------------------------+-------+------+
+```
+Note that this was a simplified example; `createdDF` can contain many rows for a __single edge label__.
+
+#### IOStep io() (TinkerPop 3.4)
+
+`io()` is a wrapper around the `updateEdges()`, `updateVertices()`, and `df()` methods. It provides import/export capabilities
+for a DseGraphFrame using the new TinkerPop API. The function parameter should be a URL pointing to the distributed file system.
+The JDBC data source and some others do not use the URL and are configured only through parameters; for these, still provide a non-empty string.
+
+By default, an io().write() call will create two directories, "vertices" and "edges", in the provided DSEFS directory
+and store the vertex and edge files there in Parquet format.
+
+```scala
+scala> g.io("dsefs:///tmp/graph").write().iterate()
+```
+
+To restore the saved data, create a new graph with the same schema and call:
+
+```scala
+scala> g.io("dsefs:///tmp/graph").read().iterate()
+```
+The DGF IoStep follows the TinkerPop extension convention: if the directory name has a "parquet", "csv", "orc", or "json" extension,
+the corresponding format will be used for loading or saving the data. The "format" parameter overrides that convention.
+
+You can save and load data in any format supported by Spark. Use the `with()` modulator to pass "format" and any format-related
+options. Here is an example of how to save a graph with multi-line strings to CSV:
+
+```scala
+scala> g.io("dsefs:///tmp/graph").`with`("format", "csv").`with`("multiLine").write().iterate()
+```
+Format-related options are standard Spark options.
+
+Vertices and edges can be exported and imported separately. Use the "vertices" and "edges" parameters for this:
+
+```scala
+g.io("dsefs:///tmp/1.json").`with`("vertices").write().iterate()
+g.io("dsefs:///tmp/1.json").`with`("edges").write().iterate()
+```
+
+Edges with a single label can be loaded with decoded IDs; see the `updateEdges(outVertexLabel, edgeLabel, inVertexLabel, df)` documentation for details:
+
+```scala
+g.io("dsefs:///tmp/data.csv").`with`("outVertexLabel", "god").`with`("edgeLabel", "self").`with`("inVertexLabel", "god")
+  .`with`("header").read().iterate()
+```
+
+To load a single vertex label, use the "vertexLabel" option:
+```scala
+g.io("dsefs:///tmp/data.csv").`with`("vertexLabel", "god").`with`("header").read().iterate()
+```
+
+### Best Practices Guide for Loading Data
+#### Common Pitfalls
+##### Null unset issue can cause excessive tombstones
+In versions prior to 6.0.7 and 6.7.3, if a user omitted columns during DataStax Graph Frames edge updates,
+the missing columns were implicitly written to DSE with `null` values, causing unintended deletions,
+tombstone buildup, and ultimately excessive stress on the system.
+
+The workaround at the time was to set `spark.setCassandraConf(Map("spark.cassandra.output.ignoreNulls" -> "true"))`,
+which ignores unset or null-valued columns so that no unintended deletions are created on the server side.
+In DSE versions 6.0.7, 6.7.3, and higher, the default for `ignoreNulls` is true.
+
+##### Unintended caching can lead to OOM exceptions
+Prior to DSE versions 5.1.14, 6.0.5, and 6.7.1, the Spark cache was used by default during a DataStax Graph Frames bulk
+loading job but was never explicitly emptied. This led to OutOfMemory (OOM) errors and other issues.
+
+The `spark.dse.graphframes.update.persistLevel` parameter was introduced to allow better control over Spark caching levels.
+
+Additionally, a new cache parameter was introduced in the multi-label update methods that can be used as a workaround
+when the user wishes to explicitly uncache data after use. See [updateVertices()](#updatevertices) and [updateEdges()](#updateedges)
+for more details.
+
+##### How to work around Materialized Views during bulk loading
+When indexing with Materialized Views is desired, it is often recommended to enable it after the data has been loaded,
+because it significantly affects insertion performance. Expect roughly a 10% performance penalty per MV, and be aware of
+some subtleties when defining the data model; see this [blog](https://www.datastax.com/dev/blog/materialized-view-performance-in-cassandra-3-x) for more details.
+
+After the data is loaded and indexing is enabled, how do we know when the view build is done? The [nodetool viewbuildstatus](https://docs.datastax.com/en/dse/6.7/dse-admin/datastax_enterprise/tools/nodetool/toolsViewBuildStatus.html) command
+exists for exactly this purpose.
+
+##### How to model multi/meta-properties
+Multi and meta-properties are modeled differently in Classic DataStax Graph compared to the new DataStax Graph. In the new
+DataStax Graph, multi and meta-properties have been removed, and it is recommended to model them with collections/UDTs or
+distinct elements. We will cover some of these details here. For broader guidance on migrating from Classic to the new
+DataStax Graph, see the [ClassicToNativeGraphMigration](ClassicToNativeGraphMigration.md) write-up.
+
+###### How to manage multi/meta-properties for Classic DataStax Graph
+Here is an example of updating vertex multi and meta-properties. Suppose we start with the schema shown directly below.
+Notice that the god vertex has a multi-property called `nicknames`, which itself has meta-properties named `time` and `date`;
+we will show how to update all of these properties.
+```groovy
+// properties
+schema.propertyKey("time").Timestamp().single().create()
+schema.propertyKey("reason").Text().single().create()
+schema.propertyKey("age").Int().single().create()
+schema.propertyKey("name").Text().single().create()
+schema.propertyKey("date").Date().single().create()
+schema.propertyKey("nicknames").Text().multiple().create()
+schema.propertyKey("nicknames").properties("time", "date").add()
+
+// vertex labels
+schema.vertexLabel("god").properties("name", "age", "nicknames").create()
+schema.vertexLabel("god").index("god_age_index").secondary().by("age").add()
+schema.vertexLabel("god").index("god_name_index").secondary().by("name").add()
+...
+
+// add vertices
+Vertex jupiter = graph.addVertex(T.label, "god", "name", "jupiter", "age", 5000);
+Vertex juno = graph.addVertex(T.label, "god", "name", "juno", "age", 5000);
+Vertex minerva = graph.addVertex(T.label, "god", "name", "minerva", "age", 5000);
+```
+
+This gives us something like the following to start with. Notice that none of the vertices have nicknames set yet; the
+property is omitted in the Gremlin console output and shows up as `null` in the DataStax Graph Frames view.
+```groovy
+gremlin> g.V().has("name", "jupiter").properties().valueMap(true)
+==>{id={~label=age, ~out_vertex={~label=god, community_id=827187456, member_id=0}, ~local_id=00000000-0000-8001-0000-000000000000}, key=age, value=5000}
+==>{id={~label=name, ~out_vertex={~label=god, community_id=827187456, member_id=0}, ~local_id=00000000-0000-8002-0000-000000000000}, key=name, value=jupiter}
+
+gremlin> g.V().has("name", "juno").properties().valueMap(true)
+==>{id={~label=age, ~out_vertex={~label=god, community_id=1316484224, member_id=0}, ~local_id=00000000-0000-8001-0000-000000000000}, key=age, value=5000}
+==>{id={~label=name, ~out_vertex={~label=god, community_id=1316484224, member_id=0}, ~local_id=00000000-0000-8002-0000-000000000000}, key=name, value=juno}
+
+gremlin> g.V().has("name", "minerva").properties().valueMap(true)
+==>{id={~label=age, ~out_vertex={~label=god, community_id=2114931072, member_id=0}, ~local_id=00000000-0000-8001-0000-000000000000}, key=age, value=5000}
+==>{id={~label=name, ~out_vertex={~label=god, community_id=2114931072, member_id=0}, ~local_id=00000000-0000-8002-0000-000000000000}, key=name, value=minerva}
+```
+
+Here is what it looks like from DataStax Graph Frames (DGF).
+```scala
+scala> val g = spark.dseGraph("gods")
+
+scala> g.V().show(false)
++--------------------+------+------------+---------+-------+----+---------+
+|id                  |~label|community_id|member_id|name   |age |nicknames|
++--------------------+------+------------+---------+-------+----+---------+
+|god:MU3hAAAAAAAAAAAA|god   |827187456   |0        |jupiter|5000|null     |
+|god:Tnf0gAAAAAAAAAAA|god   |1316484224  |0        |juno   |5000|null     |
+|god:fg9JgAAAAAAAAAAA|god   |2114931072  |0        |minerva|5000|null     |
++--------------------+------+------------+---------+-------+----+---------+
+```
+
+Now let’s see how we can add `nicknames` and its meta-properties. First construct a data frame consisting of the `community_id` and
+`member_id` of the vertex in question, along with the `nicknames` property and the meta-properties we wish to update.
+```scala +scala> val df = Seq(("827187456", "0", "overlords", java.sql.Date.valueOf("2017-01-01"), new java.sql.Timestamp(100L))).toDF("community_id", "member_id", "nicknames", "date", "time") + +scala> df.show(false) ++------------+---------+---------+----------+---------------------+ +|community_id|member_id|nicknames|date |time | ++------------+---------+---------+----------+---------------------+ +|827187456 |0 |overlords|2017-01-01|1969-12-31 16:00:00.1| ++------------+---------+---------+----------+---------------------+ +``` + +Now we create a new data frame consisting of just the id of the vertex and the nickname property to update. Notice special +care is taken in constructing this `id` field, composed of values of the known `~label`, `community_id`, and `member_id`. +```scala +scala> val updateDF = df.select(g.idColumn("god", $"community_id", $"member_id") as "id", array(struct($"nicknames", $"date", $"time")) as "nicknames") +``` + +Notice how we constructed the nicknames fields in this data frame, it is an array of `struct` types. +```scala +scala> updateDF.printSchema +root + |-- id: string (nullable = false) + |-- nicknames: array (nullable = false) + | |-- element: struct (containsNull = false) + | | |-- nicknames: string (nullable = true) + | | |-- date: date (nullable = true) + | | |-- time: timestamp (nullable = true) + +scala> updateDF.show(false) ++--------------------+------------------------------------------------+ +|id |nicknames | ++--------------------+------------------------------------------------+ +|god:MU3hAAAAAAAAAAAA|[[overlords, 2017-01-01, 1969-12-31 16:00:00.1]]| ++--------------------+------------------------------------------------+ +``` + +Now we update vertices using this updated data frame. Notice we are using the multi-vertex label flavor of the vertex +update method described in [Updating multiple vertex labels](#updating-multiple-vertex-labels). +```scala +scala> g.updateVertices(updateDF) + +scala> g.V().show(false) ++--------------------+------+------------+---------+-------+----+------------------------------------------------+ +|id |~label|community_id|member_id|name |age |nicknames | ++--------------------+------+------------+---------+-------+----+------------------------------------------------+ +|god:MU3hAAAAAAAAAAAA|god |827187456 |0 |jupiter|5000|[[overlords, 2017-01-01, 1969-12-31 16:00:00.1]]| +|god:Tnf0gAAAAAAAAAAA|god |1316484224 |0 |juno |5000|null | +|god:fg9JgAAAAAAAAAAA|god |2114931072 |0 |minerva|5000|null | ++--------------------+------+------------+---------+-------+----+------------------------------------------------+ +``` + +Another less commonly used approach is to use TinkerPop updates with DataStax Graph Frames +```scala +scala> g.V().has("community_id", "827187456").has("member_id", "0").has("~label", "god").property("nicknames", "overlords", "date", java.sql.Date.valueOf("2017-01-01"), "time", new java.sql.Timestamp(100L)).iterate() +``` + +It is worth noting that regardless of the approach used, updateVertices or TinkerPop updates with DGF, multi-properties are append only. In the example below, see what happens when we repeatedly update the nicknames property of the same vertex. 
+```scala +scala> g.V().has("community_id", "827187456").has("member_id", "0").has("~label", "god").property("nicknames", "overlords", "date", java.sql.Date.valueOf("2017-01-01"), "time", new java.sql.Timestamp(100L)).iterate() + +scala> g.V().show(false) ++--------------------+------+------------+---------+-------+----+------------------------------------------------------------------------------------------------+ +|id |~label|community_id|member_id|name |age |nicknames | ++--------------------+------+------------+---------+-------+----+------------------------------------------------------------------------------------------------+ +|god:fg9JgAAAAAAAAAAA|god |2114931072 |0 |minerva|5000|null | +|god:MU3hAAAAAAAAAAAA|god |827187456 |0 |jupiter|5000|[[overlords, 2017-01-01, 1969-12-31 16:00:00.1], [overlords, 2017-01-01, 1969-12-31 16:00:00.1]]| +|god:Tnf0gAAAAAAAAAAA|god |1316484224 |0 |juno |5000|null | ++--------------------+------+------------+---------+-------+----+------------------------------------------------------------------------------------------------+ +``` + +Now update one more time using the same data frame used previously. +```scala +scala> g.updateVertices(updateDF) + +scala> g.V().show(false) ++--------------------+------+------------+---------+-------+----+------------------------------------------------------------------------------------------------------------------------------------------------+ +|id |~label|community_id|member_id|name |age |nicknames | ++--------------------+------+------------+---------+-------+----+------------------------------------------------------------------------------------------------------------------------------------------------+ +|god:fg9JgAAAAAAAAAAA|god |2114931072 |0 |minerva|5000|null | +|god:MU3hAAAAAAAAAAAA|god |827187456 |0 |jupiter|5000|[[overlords, 2017-01-01, 1969-12-31 16:00:00.1], [overlords, 2017-01-01, 1969-12-31 16:00:00.1], [overlords, 2017-01-01, 1969-12-31 16:00:00.1]]| +|god:Tnf0gAAAAAAAAAAA|god |1316484224 |0 |juno |5000|null | ++--------------------+------+------------+---------+-------+----+------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +As expected, now we see the jupiter vertex with `nicknames` set with meta-properties. 
+```groovy +gremlin> g.V().has("name", "jupiter").properties().valueMap(true) +==>{id={~label=age, ~out_vertex={~label=god, community_id=827187456, member_id=0}, ~local_id=00000000-0000-8001-0000-000000000000}, key=age, value=5000} +==>{id={~label=name, ~out_vertex={~label=god, community_id=827187456, member_id=0}, ~local_id=00000000-0000-8002-0000-000000000000}, key=name, value=jupiter} +==>{id={~label=nicknames, ~out_vertex={~label=god, community_id=827187456, member_id=0}, ~local_id=5c43be20-468c-11e9-abad-a5d0ff50b75d}, key=nicknames, value=overlords, date=2017-01-01, time=1970-01-01T00:00:00.100Z} +==>{id={~label=nicknames, ~out_vertex={~label=god, community_id=827187456, member_id=0}, ~local_id=98261110-468f-11e9-abad-a5d0ff50b75d}, key=nicknames, value=overlords, date=2017-01-01, time=1970-01-01T00:00:00.100Z} +==>{id={~label=nicknames, ~out_vertex={~label=god, community_id=827187456, member_id=0}, ~local_id=00000000-0000-8004-0000-000000000000}, key=nicknames, value=overlords, date=2017-01-01, time=1970-01-01T00:00:00.100Z} +``` + +###### How to manage multi/meta-properties for the new DataStax Graph + +With the new DataStax Graph engine, a user has two options for handling multi and meta-properties: use CQL collection +types or drop them altogether. Details on data modeling changes between the Classic DataStax Graph and the new DataStax Graph +Engine can be found in our [ClassicToNativeGraphMigration](ClassicToNativeGraphMigration.md) write-up. Here is an example +of updating vertex labels with complex types which includes User Defined Types (UDTs), collections, and nested collections. + +Here is how we can define the new DataStax Graph data model using complex types. +```groovy +system.graph("complex").create() +:remote config alias g complex.g + +schema.type('udt1').ifNotExists().property('name', Varchar).create() + +schema.vertexLabel('collection').ifNotExists().partitionBy('name', Varchar).property('frozenList', frozen(listOf(Int))).property('frozenMap', frozen(mapOf(Int, Varchar))).property('frozenSet', frozen(setOf(Int))).property('list', listOf(Int)).property('map', mapOf(Int, Varchar)).property('set', setOf(Int)).create() + +schema.vertexLabel('person').ifNotExists().partitionBy('name', Varchar).property('udt1', typeOf('udt1')).property('udt2', mapOf(Varchar, frozen(typeOf('udt1')))).create() + +schema.vertexLabel('software').ifNotExists().partitionBy('name', Varchar).property('versions1', tupleOf(Int, Varchar)).property('versions2', tupleOf(frozen(setOf(Int)), frozen(listOf(Varchar)))).create() + +// add data +g.addV('person').property( 'name', 'A').property('udt1', typeOf('udt1').create('B')).property('udt2', ['B': typeOf('udt1').create('C')]) + +g.addV('software').property( 'name', 'B').property('versions1', tupleOf(Int, Varchar).create(2, '3')).property('versions2', tupleOf(frozen(setOf(Int)), frozen(listOf(Varchar))).create([1, 2, 3].toSet(), ['1', '2', '3'])) + +g.addV('collection').property( 'name', 'C').property('map', [1:'1', 2:'2', 3:'3']).property('frozenMap', [1:'1', 2:'2', 3:'3']).property('set', [1, 2, 3].toSet()).property('frozenSet', [1, 2, 3].toSet()).property('list', [1, 2, 3]).property('frozenList', [1, 2, 3]) +``` + +From DataStax Graph Frames (DGF) it looks like this. 
+```scala
+scala> val g = spark.dseGraph("complex")
+
+scala> g.V.show(false)
++---------------+----------+----+---------+----------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+
+|id             |~label    |name|versions1|versions2             |udt1|udt2      |frozenList|frozenMap               |frozenSet|list     |map                     |set      |
++---------------+----------+----+---------+----------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+
+|software:B#19  |software  |B   |[2, 3]   |[[1, 2, 3], [1, 2, 3]]|null|null      |null      |null                    |null     |null     |null                    |null     |
+|person:A#10    |person    |A   |null     |null                  |[B] |[B -> [C]]|null      |null                    |null     |null     |null                    |null     |
+|collection:C#11|collection|C   |null     |null                  |null|null      |[1, 2, 3] |[1 -> 1, 2 -> 2, 3 -> 3]|[1, 2, 3]|[1, 2, 3]|[1 -> 1, 2 -> 2, 3 -> 3]|[1, 2, 3]|
++---------------+----------+----+---------+----------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+
+```
+
+Let’s look at some examples where the data frame is constructed from scratch. Suppose we want to update the software vertex
+in this example and change the `versions1` tuple values. We start by defining a data frame with `name` (the partition key) and
+the column of interest, `versions1` in this case. Notice that we use a Scala tuple to wrap the new values to be inserted.
+```scala
+scala> val df = Seq(("B", (20, "30"))).toDF("name", "versions1")
+
+scala> df.printSchema
+root
+ |-- name: string (nullable = true)
+ |-- versions1: struct (nullable = true)
+ |    |-- _1: integer (nullable = false)
+ |    |-- _2: string (nullable = true)
+```
+
+We then create a data frame consisting of the properly formatted `id`, which is constructed from the vertex label and the partition key (`name`), and the `versions1` field.
+```scala
+scala> val updateDF = df.select(g.idColumn("software", $"name") as "id", $"versions1")
+
+scala> updateDF.show(false)
++-------------+---------+
+|id           |versions1|
++-------------+---------+
+|software:B#19|[20, 30] |
++-------------+---------+
+
+scala> g.updateVertices(updateDF)
+
+scala> g.V().show(false)
++---------------+----------+----+---------+----------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+
+|id             |~label    |name|versions1|versions2             |udt1|udt2      |frozenList|frozenMap               |frozenSet|list     |map                     |set      |
++---------------+----------+----+---------+----------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+
+|software:B#19  |software  |B   |[20, 30] |[[1, 2, 3], [1, 2, 3]]|null|null      |null      |null                    |null     |null     |null                    |null     |
+|person:A#10    |person    |A   |null     |null                  |[B] |[B -> [C]]|null      |null                    |null     |null     |null                    |null     |
+|collection:C#11|collection|C   |null     |null                  |null|null      |[1, 2, 3] |[1 -> 1, 2 -> 2, 3 -> 3]|[1, 2, 3]|[1, 2, 3]|[1 -> 1, 2 -> 2, 3 -> 3]|[1, 2, 3]|
++---------------+----------+----+---------+----------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+
+```
+
+Now let’s update the `versions2` column. We start by reminding ourselves what the schema looks like.
+```scala +scala> g.V.printSchema +root + |-- id: string (nullable = true) + |-- ~label: string (nullable = false) + |-- name: string (nullable = false) + |-- versions1: struct (nullable = true) + | |-- 0: integer (nullable = true) + | |-- 1: string (nullable = true) + |-- versions2: struct (nullable = true) + | |-- 0: array (nullable = true) + | | |-- element: integer (containsNull = true) + | |-- 1: array (nullable = true) + | | |-- element: string (containsNull = true) + |-- udt1: struct (nullable = true) + | |-- name: string (nullable = true) + |-- udt2: map (nullable = true) + | |-- key: string + | |-- value: struct (valueContainsNull = true) + | | |-- name: string (nullable = true) + |-- frozenList: array (nullable = true) + | |-- element: integer (containsNull = true) + |-- frozenMap: map (nullable = true) + | |-- key: integer + | |-- value: string (valueContainsNull = true) + |-- frozenSet: array (nullable = true) + | |-- element: integer (containsNull = true) + |-- list: array (nullable = true) + | |-- element: integer (containsNull = true) + |-- map: map (nullable = true) + | |-- key: integer + | |-- value: string (valueContainsNull = true) + |-- set: array (nullable = true) + | |-- element: integer (containsNull = true) +``` + +Okay so we need to construct a data frame that matches this expectation, this is how we do it. +```scala +scala> val df = Seq(("B", (Seq(100,200,300).toSet, Seq("100","200","300").toList))).toDF("name", "versions2") + +scala> df.printSchema +root + |-- name: string (nullable = true) + |-- versions2: struct (nullable = true) + | |-- _1: array (nullable = true) + | | |-- element: integer (containsNull = false) + | |-- _2: array (nullable = true) + | | |-- element: string (containsNull = true) +``` + +Now we simply massage it to properly format the `id` column as we’ve done before. +```scala +scala> val updateDF = df.select(g.idColumn("software", $"name") as "id", $"versions2") + +scala> updateDF.show(false) ++-------------+----------------------------------+ +|id |versions2 | ++-------------+----------------------------------+ +|software:B#19|[[100, 200, 300], [100, 200, 300]]| ++-------------+----------------------------------+ + +scala> g.updateVertices(updateDF) +``` + +As we can see the `versions2` column has been properly updated with the new values. +```scala +scala> g.V().show(false) ++---------------+----------+----+---------+----------------------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+ +|id |~label |name|versions1|versions2 |udt1|udt2 |frozenList|frozenMap |frozenSet|list |map |set | ++---------------+----------+----+---------+----------------------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+ +|software:B#19 |software |B |[20, 30] |[[100, 200, 300], [100, 200, 300]]|null|null |null |null |null |null |null |null | +|person:A#10 |person |A |null |null |[B] |[B -> [C]]|null |null |null |null |null |null | +|collection:C#11|collection|C |null |null |null|null |[1, 2, 3] |[1 -> 1, 2 -> 2, 3 -> 3]|[1, 2, 3]|[1, 2, 3]|[1 -> 1, 2 -> 2, 3 -> 3]|[1, 2, 3]| ++---------------+----------+----+---------+----------------------------------+----+----------+----------+------------------------+---------+---------+------------------------+---------+ +``` + +Confirmed, the data looks good outside of DataStax Graph Frames as expected. 
+```groovy +gremlin> g.V().hasLabel("software").valueMap(true) +==>{id=software:B#19, label=software, versions2=[({100,200,300},['100','200','300'])], versions1=[(20,'30')], name=[B]} +``` + +These same principles apply when updating other collection types. Here is an example for updating the map column. +```scala +scala> val df = Seq(("C", Map(10->"10", 20->"20", 30->"30"))).toDF("name", "map") + +scala> df.printSchema +root + |-- name: string (nullable = true) + |-- map: map (nullable = true) + | |-- key: integer + | |-- value: string (valueContainsNull = true) + + +scala> df.show(false) ++----+------------------------------+ +|name|map | ++----+------------------------------+ +|C |[10 -> 10, 20 -> 20, 30 -> 30]| ++----+------------------------------+ + + +scala> val updateDF = df.select(g.idColumn("collection", $"name") as "id", $"map") + +scala> updateDF.show(false) ++---------------+------------------------------+ +|id |map | ++---------------+------------------------------+ +|collection:C#11|[10 -> 10, 20 -> 20, 30 -> 30]| ++---------------+------------------------------+ + + +scala> g.updateVertices(updateDF) + +scala> g.V.show(false) ++---------------+----------+----+---------+----------------------------------+----+----------+----------+------------------------+---------+---------+------------------------------+---------+ +|id |~label |name|versions1|versions2 |udt1|udt2 |frozenList|frozenMap |frozenSet|list |map |set | ++---------------+----------+----+---------+----------------------------------+----+----------+----------+------------------------+---------+---------+------------------------------+---------+ +|software:B#19 |software |B |[20, 30] |[[100, 200, 300], [100, 200, 300]]|null|null |null |null |null |null |null |null | +|person:A#10 |person |A |null |null |[B] |[B -> [C]]|null |null |null |null |null |null | +|collection:C#11|collection|C |null |null |null|null |[1, 2, 3] |[1 -> 1, 2 -> 2, 3 -> 3]|[1, 2, 3]|[1, 2, 3]|[10 -> 10, 20 -> 20, 30 -> 30]|[1, 2, 3]| ++---------------+----------+----+---------+----------------------------------+----+----------+----------+------------------------+---------+---------+------------------------------+---------+ +``` + +Let’s look at one more example, updating the `udt2` column that is a map consisting of a struct. In this case we can use +a case class to accomplish this goal. 
+```scala +scala> case class udt2Value(name: String) + +scala> val df = Seq(("A", Map("Rocco"->udt2Value("likes tacos")))).toDF("name", "udt2") + +scala> val updateDF = df.select(g.idColumn("person", $"name") as "id", $"udt2") + +scala> updateDF.printSchema +root + |-- id: string (nullable = true) + |-- udt2: map (nullable = true) + | |-- key: string + | |-- value: struct (valueContainsNull = true) + | | |-- name: string (nullable = true) + +scala> updateDF.show(false) ++-----------+------------------------+ +|id |udt2 | ++-----------+------------------------+ +|person:A#10|[Rocco -> [likes tacos]]| ++-----------+------------------------+ + +scala> g.updateVertices(updateDF) + +scala> g.V().show(false) ++---------------+----------+----+---------+----------------------------------+----+------------------------+----------+------------------------+---------+---------+------------------------------+---------+ +|id |~label |name|versions1|versions2 |udt1|udt2 |frozenList|frozenMap |frozenSet|list |map |set | ++---------------+----------+----+---------+----------------------------------+----+------------------------+----------+------------------------+---------+---------+------------------------------+---------+ +|software:B#19 |software |B |[20, 30] |[[100, 200, 300], [100, 200, 300]]|null|null |null |null |null |null |null |null | +|person:A#10 |person |A |null |null |[B] |[Rocco -> [likes tacos]]|null |null |null |null |null |null | +|collection:C#11|collection|C |null |null |null|null |[1, 2, 3] |[1 -> 1, 2 -> 2, 3 -> 3]|[1, 2, 3]|[1, 2, 3]|[10 -> 10, 20 -> 20, 30 -> 30]|[1, 2, 3]| ++---------------+----------+----+---------+----------------------------------+----+------------------------+----------+------------------------+---------+---------+------------------------------+---------+ +``` + +##### Multi-edge updates should include UUIDs to ensure idempotent upserts +In Classic DataStax Graph, when updating edges users should provide a valid and unique UUID for the `id` column. Here we will +show two examples of using `updateEdges`, one in which we reuse an existing row’s data, and another that shows how to +explicitly set the `id` with a UUID. + +Suppose we start with the following graph schema, our examples will look at updates with the “lives” edge label. +```groovy +// truncated example for brevity +schema.propertyKey("nicknames").Text().multiple().create() +schema.propertyKey("reason").Text().single().create() +schema.propertyKey("age").Int().single().create() +schema.propertyKey("name").Text().single().create() +schema.propertyKey("date").Date().single().create() +schema.propertyKey("nicknames").properties("time", "date").add() + +schema.vertexLabel("location").properties("name").create() +schema.vertexLabel("god").properties("name", "age", "nicknames").create() +schema.vertexLabel("god").index("god_age_index").secondary().by("age").add() +schema.vertexLabel("god").index("god_name_index").secondary().by("name").add() + +schema.edgeLabel("lives").multiple().properties("reason").create() +schema.edgeLabel("lives").connection("god", "location").add() + +// Add data +Vertex neptune = graph.addVertex(T.label, "god", "name", "neptune", "age", 4500); +Vertex sea = graph.addVertex(T.label, "location", "name", "sea"); + +neptune.addEdge("lives", sea).property("reason", "loves waves"); +``` + +Here is what the edges table looks like from the DataStax Graph Frames perspective (truncated). Make note of the +automatically-generated UUID generated for this edge, we will refer to this later. 
+```java +DseGraphFrame gf = DseGraphFrameBuilder.dseGraph(keyspace, spark); +gf.E().df().show(false); + ++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+ +|src |dst |~label |id |time |name |reason | ++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+ +|god:RnS4AAAAAAAAAAAE |location:RnS4AAAAAAAAAAAJ|lives |f695a6b0-4500-11e9-8e88-fb68397e4bea|null |null |loves waves | ++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+ +``` + +###### Using an existing row’s data + +This is how we grab edge(s) that have a `reason` column set to “loves waves”, then overwrite this setting to “New”. + +```java +DseGraphFrame gf = DseGraphFrameBuilder.dseGraph(keyspace, spark); +Dataset u = gf.gf().edges().filter("reason = 'loves waves'").drop("time").drop("reason").drop("name").withColumn("reason", functions.lit("New")); +``` + +This gives us the following dataset that will be used to update the edge(s). Notice that because we are using an existing +row and simply reinserting it with a new reason, we get the unique id for free. +```java +u.show(false); ++--------------------+-------------------------+------+------------------------------------+------+ +|src |dst |~label|id |reason| ++--------------------+-------------------------+------+------------------------------------+------+ +|god:RnS4AAAAAAAAAAAE|location:RnS4AAAAAAAAAAAJ|lives |f695a6b0-4500-11e9-8e88-fb68397e4bea|New | ++--------------------+-------------------------+------+------------------------------------+------+ +``` + +Updating the edge(s) can be accomplished by simply calling the following command +```java +gf.updateEdges(u); +``` + +As expected we see the edge with id "f695a6b0-4500-11e9-8e88-fb68397e4bea" has the reason set to “New”. +```java +gf.E().df().show(false); ++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+ +|src |dst |~label |id |time |name |reason | ++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+ +|god:RnS4AAAAAAAAAAAE |location:RnS4AAAAAAAAAAAJ|lives |f695a6b0-4500-11e9-8e88-fb68397e4bea|null |null |New | ++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+ +``` + +###### Explicitly setting the “id” column +That was a simple example, but how does one extract this `id` field and explicitly use it when constructing the dataset +when updating edges? It’s actually very simple. 
+
+Let’s start with the dataset used in the previous example:
+```java
+DseGraphFrame gf = DseGraphFrameBuilder.dseGraph(keyspace, spark);
+Dataset<Row> u = gf.gf().edges().filter("reason = 'loves waves'").drop("time").drop("reason").drop("name").withColumn("reason", functions.lit("New"));
+
+u.show(false);
++--------------------+-------------------------+------+------------------------------------+------+
+|src                 |dst                      |~label|id                                  |reason|
++--------------------+-------------------------+------+------------------------------------+------+
+|god:RnS4AAAAAAAAAAAE|location:RnS4AAAAAAAAAAAJ|lives |f695a6b0-4500-11e9-8e88-fb68397e4bea|New   |
++--------------------+-------------------------+------+------------------------------------+------+
+```
+
+Now, to extract the unique id, simply do this:
+```java
+String newUUID = u.collectAsList().get(0).getString(3);
+```
+
+In this case we know we have only one row so we use `get(0)`, and we know the `id` column is at index 3 so we use `getString(3)`.
+
+Now, to explicitly set the id field and update the edge, we would do this:
+```java
+u = gf.gf().edges().filter("reason = 'New'").drop("id").drop("time").drop("reason").drop("name")
+        .withColumn("reason", functions.lit("New1"))
+        .withColumn("id", functions.lit(newUUID));
+
+gf.updateEdges(u);
+```
+
+For demonstration purposes, we explicitly set the `id` column to the previously saved UUID and inserted a new reason for this row.
+```java
+gf.E().df().show(false);
++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+
+|src                     |dst                      |~label |id                                  |time                   |name                              |reason             |
++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+
+|god:RnS4AAAAAAAAAAAE    |location:RnS4AAAAAAAAAAAJ|lives  |f695a6b0-4500-11e9-8e88-fb68397e4bea|null                   |null                              |New1               |
++------------------------+-------------------------+-------+------------------------------------+-----------------------+----------------------------------+-------------------+
+```
+
+In general, it’s important to note that when updating edges you must include the `id` column in the dataset for the _existing edges to be updated_.
+If a user omits the `id` column and instead only supplies the `src`, `dst`, and `~label`, they will end up duplicating
+edges with auto-generated IDs.
+
+##### Key order matters when using the idColumn() method
+When using the `idColumn()` method for vertices that have multiple key columns, it is important to pass the key columns
+into the function in the same order in which they are defined in the schema.
+
+Suppose we have the following name vertex label schema:
+```groovy
+schema.vertexLabel("name")
+    .partitionKey("internal_party_id")
+    .clusteringKey(
+        "prefx_nm",
+        "first_nm",
+        "mdl_nm",
+        "last_nm",
+        "sufx_nm",
+        "name_line_desc"
+)
+```
+
+Be careful when passing the keys to the `idColumn()` method: the order must match that defined in the schema. Here is an example of the correct way to pass in these keys.
+```scala
+scala> val hasNameEdges = nameVertices
+  .drop(col("~label"))
+  .withColumn("src", nameVertices.col("partyId"))
+  .withColumn("dst", g.idColumn(
+    lit("name"),
+    nameVertices.col("internal_party_id"),
+    nameVertices.col("prefx_nm"),
+    nameVertices.col("first_nm"),
+    nameVertices.col("mdl_nm"),
+    nameVertices.col("last_nm"),
+    nameVertices.col("sufx_nm"),
+    nameVertices.col("name_line_desc")
+  ))
+  .withColumn("~label", lit("has_name"))
+  .drop(col("internal_party_id"))
+  .drop(col("partyId"))
+  .drop(col("first_nm"))
+  .drop(col("last_nm"))
+  .drop(col("mdl_nm"))
+  .drop(col("name_line_desc"))
+  .drop(col("prefx_nm"))
+  .drop(col("sufx_nm"))
+
+scala> g.updateEdges(hasNameEdges)
+```
+
+The new API for updating single labels was introduced to address this issue and make the user experience more frictionless;
+see [updateVertices](#updatevertices) and [updateEdges](#updateedges) for more details.
+
+#### Tuning Considerations for Loading Big Graphs
+##### Spark Cassandra Connector tuning parameters still apply with DataStax Graph Frames
+To increase write performance during DataStax Graph Frames (DGF) bulk loading, remember that the existing Spark Cassandra Connector
+tuning parameters still apply: [Setting Spark Cassandra Connector Specific Properties](https://docs.datastax.com/en/dse/6.7/dse-dev/datastax_enterprise/spark/sparkCassandraProperties.html?hl=setting%2Cspark%2Ccassandra%2Cconnector-specific%2Cproperties)
+
+For example, `spark.cassandra.output.concurrent.writes` has been found to be one of the most intuitive and effective parameters
+to adjust during load testing. Other parameters such as `spark.cassandra.output.throughput_mb_per_sec` can be very helpful as well;
+when a long insertion workload is expected, it may be wise to tune this down appropriately to avoid overwhelming the database cluster.
+
+The `spark.cassandra.connection.keepAliveMS` parameter may also be useful in scenarios with long-running insertion workloads where
+connections may experience longer-than-expected periods of inactivity, a potential side effect of periodic delays while
+processing insertions/updates on the server.
+
+Here is an example of using these parameters:
+```
+dse spark-submit \
+    --conf "spark.cassandra.output.concurrent.writes=100" \
+    --conf "spark.cassandra.connection.keepAliveMS=120000" \
+    --executor-memory=8g \
+    --class com.datastax.DataImport target/data-import-1.0-SNAPSHOT.jar \
+    newapi
+```
+
+##### Avoid over-tuning your application on a small dataset
+Be careful when tuning with a small dataset: parameters tuned for a short insertion workload will very likely not behave
+the same for longer, more intensive workloads. A longer, sustained insertion workload will lead to more data and more
+severe effects from background tasks such as memtable flushing, compaction, query routing, etc. In short, we recommend an
+incremental approach when loading data. Try loading, say, 10-20% of the data, making note of the parameters, cluster size,
+and overall node health during the process (e.g. look out for obvious problems like timeout exceptions).
+
+Also, increasing the cluster size can serve as an effective strategy for reducing individual node stress and improving
+overall ingestion performance. Again, there is no one-size-fits-all solution here, but an incremental approach with
+reasonably chosen tuning parameters and environment setup is a good starting point.
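+
+For illustration, here is a minimal sketch of how these connector properties could also be set programmatically when
+building the `SparkSession` of a standalone bulk-loading application, rather than on the `dse spark-submit` command line.
+The object and application names (`TunedBulkLoad`, `dgf-bulk-load`) are hypothetical, and the values shown are placeholders
+to be adjusted for your cluster and workload.
+
+```scala
+import org.apache.spark.sql.SparkSession
+
+// Hypothetical driver for a DGF bulk-loading job; only the session setup is
+// shown, the vertex/edge updates would follow as in the examples above.
+object TunedBulkLoad {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession.builder()
+      .appName("dgf-bulk-load")
+      // Throttle write concurrency and throughput for a long insertion workload.
+      .config("spark.cassandra.output.concurrent.writes", "100")
+      .config("spark.cassandra.output.throughput_mb_per_sec", "50")
+      // Keep connections alive across quiet periods during the load.
+      .config("spark.cassandra.connection.keepAliveMS", "120000")
+      .getOrCreate()
+
+    // ... build the vertex/edge DataFrames and call g.updateVertices / g.updateEdges here ...
+
+    spark.stop()
+  }
+}
+```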
diff --git a/dsgraph/graph-docs/GettingStarted.md b/dsgraph/graph-docs/GettingStarted.md new file mode 100644 index 0000000..78d0faf --- /dev/null +++ b/dsgraph/graph-docs/GettingStarted.md @@ -0,0 +1,112 @@ +# Getting started with Native Engine Graph + +## Setup +Create a graph. +``` +system.graph('crew').create() +``` +If using gremlin console then alias the graph. +``` +:remote config alias g crew.g +``` +## Add schema +``` +schema.vertexLabel('person'). + partitionBy('name', Text). + property('age', Int). + create() + +schema.vertexLabel('software'). + partitionBy('name', Text). + property('version', Int). + property('lang', Text). + create() + +schema.edgeLabel('created'). + from('person').to('software'). + property('weight', Double). + create() +``` +## Add some data +``` +g.addV('person'). + property('name', 'bob'). + property('age', 30). + as('bob'). + addV('software'). + property('name', 'studio'). + property('lang', 'java'). + property('version', 1). + as('studio'). + addE('created').from('bob').to('studio'). + property('weight', 0.8). + iterate() +``` +Warning! Remember that a scripts with multiple traversals will only iterate the last traversal automatically. It +is important to end traversals with `.iterate()` to force iteration in this situation. +``` +g.addV('person'). + property('name', 'bob'). + property('age', 30). + iterate() //Required `bob` will NOT be inserted +g.addV('software'). + property('name', 'studio'). + property('lang', 'java'). + property('version', 1). + iterate() +``` +It is generally good practice to add `.iterate()` if you do not need the results to be returned to you. This saves the +overhead of sending the results back to the client and improves performance. + +## Query your graph +Query for `bob`. +``` +g.V().hasLabel('person').has('name', 'bob') +``` +``` +==>v[person:bob#64] +``` + +Query for software that `bob` created. +``` +g.V().hasLabel('person').has('name', 'bob').out('created') +``` +``` +==>v[software:studio#37] +``` +Query for software `studio`. +``` +g.V().hasLabel('software').has('name', 'studio') +``` +``` +==>v[software:studio#37] +``` + +## Perform a query that requires the existence of an index +Query for creator of `studio`. This will throw an error as traversal from `software` to `person` will require an index +``` +g.V().hasLabel('software').has('name', 'studio').in() +``` +``` +One or more indexes are required to execute the traversal: g.V().hasLabel("software").has("name","studio").in() +Failed step: DseVertexStep(__.in()) +CQL execution: No table or view could satisfy the query 'SELECT * FROM crew.person__created__software WHERE software_name = ?' +The output of 'schema.indexFor().analyze()' suggests the following indexes could be created to allow execution: + +schema.edgeLabel('created').from('person').to('software').materializedView('person__created__software_by_software_name').ifNotExists().partitionBy(IN, 'name').clusterBy(OUT, 'name', Asc).create() + +Alternatively consider using: +g.with('ignore-unindexed') to ignore unindexed traversal. Your results may be incomplete. +g.with('allow-filtering') to allow filtering. This may have performance implications. +``` +Create the required index as specified in the error message. +``` +schema.edgeLabel('created').from('person').to('software').materializedView('person__created__software_by_software_name').ifNotExists().partitionBy(IN, 'name').clusterBy(OUT, 'name', Asc).create() +``` +Run the query again. 
+``` +g.V().hasLabel('software').has('name', 'studio').in() +``` +``` +==>v[person:bob#64] +``` \ No newline at end of file diff --git a/dsgraph/graph-docs/GraphBinaryDSETypes.md b/dsgraph/graph-docs/GraphBinaryDSETypes.md new file mode 100644 index 0000000..1ba56b2 --- /dev/null +++ b/dsgraph/graph-docs/GraphBinaryDSETypes.md @@ -0,0 +1,82 @@ +# GraphBinary DSE specific types. + +DSE 6.8 comes with support for GraphBinary from the TinkerPop drivers and the DSE drivers. DSE has some additional types +from the standard GraphBinary set of supported types. These types are implemented via Custom types from the GraphBinary +protocol specification. + +## Types + +### CQL Duration + +Custom Type name: `"driver.core.Duration"` + +Format: `{months}{days}{nanoseconds}` + +Where: + +* `months` is a 4 bytes `Int` +* `days` is a 4 bytes `Int` +* `nanoseconds` is an 8 bytes `Long` + +### Geo Point / Geo LineString / Geo Polygon + +Custom Type name: `"driver.dse.geometry.Point"` / `"driver.dse.geometry.LineString"` / `"driver.dse.geometry.Polygon"` + +Format: `{wkb}` + +Where: + +* `wkb` is the bytes representing the Geo type encoded as a WKB + +### Geo Distance + +Custom Type name: `"driver.dse.geometry.Distance"` + +Format: `{center}{radius}` + +Where: + +* `center` is encoded as a `Geo Point` +* `radius` is an 8 bytes `Double` + +### EditDistance + +Custom Type name: `"driver.dse.search.EditDistance"` + +Format: `{distance}{query}` + +Where: + +* `distance` is a 4 bytes `Int` of the predicate's distance +* `query` is a `String` of the predicate's query + +### Pair + +Custom Type name: `"org.javatuples.Pair"` + +Format: `{item1}{item2}` + +Where: + +* `item1` is a fully qualified value composed of `{type_code}{type_info}{value_flag}{value}` +* `item2` is a fully qualified value composed of `{type_code}{type_info}{value_flag}{value}` + +### TupleValue / UDT Value + +Custom Type name: `"driver.core.TupleValue"` / `"driver.core.UDTValue"` + +Format: `{type_spec}{value_bytes}` + +Where: + +* `type_spec` is a full CQL type specification. For a UDT the specification for the type is defined +here: https://github.com/apache/cassandra/blob/cassandra-3.11.0/doc/native_protocol_v5.spec#L641 and for Tuples the +specification is defined here: https://github.com/apache/cassandra/blob/cassandra-3.11.0/doc/native_protocol_v5.spec#L653 +* `value_bytes` is the values of the fields of the Tuple or UDT, encoded directly into the CQL native protocol format. +See CQL protocol specification for more information on the format of these values. + + + + + + diff --git a/dsgraph/graph-docs/IndexAnalyzer.md b/dsgraph/graph-docs/IndexAnalyzer.md new file mode 100644 index 0000000..5a02237 --- /dev/null +++ b/dsgraph/graph-docs/IndexAnalyzer.md @@ -0,0 +1,170 @@ +# Index Analyzer + +Native Engine graphs benefits from automatic index analysis either explicitly via the schema API or via error messages. + +The index analyzer has the ability to figure out what indexes a given traversal requires. It will propose those necessary indexes and the user has the option to either create those indexes manually (through copy/paste) or automatically (through `.apply()`). + + +## How to determine what indexes a provided Traversal requires? 
+In order to let the index analyzer figure out what indexes a particular traversal requires, a user needs to execute `schema.indexFor().analyze()` as shown below: + +``` +gremlin> schema.indexFor(g.V().has("age", 23)).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('person').materializedView('person_by_age').ifNotExists().partitionBy('age').clusterBy('firstname', Asc).clusterBy('lastname', Asc).create() +``` + +The index analyzer will also indicate in case a traversal can be fulfilled by existing indexes: +``` +gremlin> schema.indexFor(g.V().has("name", "x")).analyze() +==>Traversal can be satisfied by existing indexes +``` + +## How to automatically create an index for a provided Traversal? + +This can be done through `schema.indexFor().apply()`. It is perfectly valid to skip `schema.indexFor().analyze()` +and execute `.apply()` directly. The `.apply()` step will indicate what indexes are being created. + +``` +gremlin> schema.indexFor(g.V().has("age", 23)).apply() +==>Creating the following indexes: +schema.vertexLabel('person').materializedView('person_by_age').ifNotExists().partitionBy('age').clusterBy('firstname', Asc).clusterBy('lastname', Asc).create() +OK +``` + +## What type of indexes can the Index Analyzer create? + +Depending on the particular traversal, the index analyzer can suggest the creation of one or more indexes of the following types: + +* a Materialized View for predicates that are not search-specific or specific to CQL collections +* a Search Index for specific predicates (e.g. `token` / `regex` / `phrase` / `neq` / ...) that can only be fulfilled by a search index +* a Secondary index for specific predicates (e.g. `contains(x)` / `containsKey(x)` / `containsValue(x)` / `entryEq(x, y)`) against CQL collections (Lists/Sets/Maps) + +## When are Materialized Views created? + +A Materialized View will generally be created if predicates in the traversal are not specific to CQL collections and are not search-specific. + +The below example filters on `phone` and orders by `age`. Since this can be fulfilled by a Materialized View, the output will be: +``` +gremlin> schema.indexFor(g.V().has("phone", "123-456-789").order().by("age", desc)).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('person').materializedView('person_by_phone_age_Desc').ifNotExists().partitionBy('phone').clusterBy('age', Desc).clusterBy('firstname', Asc).clusterBy('lastname', Asc).create() +``` + +## When are Search Indexes created? + +The index analyzer will pick a Search Index if the predicate is Search-specific, such as `token` / `tokenPrefix` / `tokenRegex` / `tokenFuzzy` / `phrase` / `regex` / `prefix` / `fuzzy`. +Additional predicates that require a Search Index are Geo predicates (`inside` / `insideCartesian`) and `neq` / `without`. 
+ +The below example uses the predicate `regex` and filters on `age` and so the index analyzer will suggest a Search Index: +``` +gremlin> schema.indexFor(g.V().has('name', Search.regex('Alan')).has('age', gt(30))).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('p').searchIndex().ifNotExists().by('name').by('age').create() +``` + +A slightly more complicated example that will result in the suggestion of a Search Index due to using the `regex` / `prefix` predicates: +``` +gremlin> schema.indexFor(g.V().has('title', Search.regex('dse')).has('lang', Search.prefix('jav')).order().by('url')).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('software').searchIndex().ifNotExists().by('title').by('lang').by('url').create() +``` + +## When are Secondary Indexes created? + +The index analyzer will pick a Secondary Index for predicates (`contains(x)` / `containsKey(x)` / `containsValue(x)` / `entryEq(x, y)`) against CQL collections, such as Lists/Sets/Maps. + +The below example suggests a Secondary Index when filtering data in a **list**: +``` +gremlin> schema.indexFor(g.V().has("list", contains(23))).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('collections').secondaryIndex('collections_2i_by_list').ifNotExists().by('list').indexValues().create() +``` + +The same applies for queries against a **set**: +``` +gremlin> schema.indexFor(g.V().has("set", contains(45))).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('collections').secondaryIndex('collections_2i_by_set').ifNotExists().by('set').indexValues().create() +``` + +The below examples show usage of the **map** predicates `containsKey(x)` / `containsValue(x)` / `entryEq(x, y)` and the suggested indexes: +``` +gremlin> schema.indexFor(g.V().has("map", containsKey(45))).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('collections').secondaryIndex('collections_2i_by_map1').ifNotExists().by('map').indexKeys().create() + + +gremlin> schema.indexFor(g.V().has("map", containsValue("some item"))).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('collections').secondaryIndex('collections_2i_by_map2').ifNotExists().by('map').indexValues().create() + + +gremlin> schema.indexFor(g.V().has("map", entryEq(45, "some item"))).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('collections').secondaryIndex('collections_2i_by_map3').ifNotExists().by('map').indexEntries().create() +``` + +## When does the Index Analyzer create multiple indexes? + +This is generally the case when the traversal will hit more than one table (e.g. when we're not filtering on a particular label). +Given the traversal `g.V().has("list", contains(23))` and the three vertex labels shown below will require three separate indexes. 
+ +``` +gremlin> schema.describe() +==>schema.vertexLabel('a').ifNotExists().partitionBy('id', Int).property('age', Int).property('name', Varchar).property('list', listOf(Int)).create() +schema.vertexLabel('b').ifNotExists().partitionBy('id', Int).property('age', Int).property('name', Varchar).property('list', listOf(Int)).create() +schema.vertexLabel('c').ifNotExists().partitionBy('id', Int).property('age', Int).property('name', Varchar).property('list', listOf(Int)).create() +``` + +``` +gremlin> schema.indexFor(g.V().has("list", contains(23))).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('c').secondaryIndex('c_2i_by_list').ifNotExists().by('list').indexValues().create() +schema.vertexLabel('a').secondaryIndex('a_2i_by_list').ifNotExists().by('list').indexValues().create() +schema.vertexLabel('b').secondaryIndex('b_2i_by_list').ifNotExists().by('list').indexValues().create() +``` + +The traversal `g.V().has("list", contains(23)).has("age", 23)` would require a Secondary Index (due to filtering on a CQL collection) and a Materialized View for each table. +Since no vertex label filtering is performed, the index analyzer will suggest six indexes in total: +``` +gremlin> schema.indexFor(g.V().has("list", contains(23)).has("age", 23)).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('c').materializedView('c_by_age').ifNotExists().partitionBy('age').clusterBy('id', Asc).create() +schema.vertexLabel('c').secondaryIndex('c_2i_by_list').ifNotExists().by('list').indexValues().create() +schema.vertexLabel('a').materializedView('a_by_age').ifNotExists().partitionBy('age').clusterBy('id', Asc).create() +schema.vertexLabel('a').secondaryIndex('a_2i_by_list').ifNotExists().by('list').indexValues().create() +schema.vertexLabel('b').materializedView('b_by_age').ifNotExists().partitionBy('age').clusterBy('id', Asc).create() +schema.vertexLabel('b').secondaryIndex('b_2i_by_list').ifNotExists().by('list').indexValues().create() +``` + +The traversal `g.V().has("list", contains(23)).has("age", 23).has("name", Search.regex(".*ohn"))` would also require two indexes per table. +A Secondary Index (due to filtering on a CQL collection) and a search index (due to `regex`). 
The index analyzer will not create a Materialized View for `age` +as was shown in the previous example, but rather use the search index for that: + +``` +gremlin> schema.indexFor(g.V().has("list", contains(23)).has("age", 23).has("name", Search.regex(".*ohn"))).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('c').searchIndex().ifNotExists().by('age').by('list').by('name').create() +schema.vertexLabel('c').secondaryIndex('c_2i_by_list').ifNotExists().by('list').indexValues().create() +schema.vertexLabel('a').searchIndex().ifNotExists().by('age').by('list').by('name').create() +schema.vertexLabel('a').secondaryIndex('a_2i_by_list').ifNotExists().by('list').indexValues().create() +schema.vertexLabel('b').searchIndex().ifNotExists().by('age').by('list').by('name').create() +schema.vertexLabel('b').secondaryIndex('b_2i_by_list').ifNotExists().by('list').indexValues().create() +``` + +--- + +**NOTE** + +It is generally always recommended to use an element label when filtering as otherwise multiple CQL statements will be executed and data might get filtered in-memory by Tinkerpop and not by C* + +--- + +When filtering on the vertex label `a` in `g.V().hasLabel("a").has("list", contains(23)).has("age", 23).has("name", Search.regex(".*ohn"))`, the index analyzer will fail to suggest any indexes, because it tries to leverage a search index for both conditions: +``` +gremlin> schema.indexFor(g.V().hasLabel("a").has("list", contains(23)).has("age", 23).has("name", "hans")).analyze() +==>Failed to suggest indexes for traversal. Some steps in the traversal might not be supported yet. +``` + +This is currently a limitation that will be addressed in a future release. \ No newline at end of file diff --git a/dsgraph/graph-docs/ProtocolCompatibility.md b/dsgraph/graph-docs/ProtocolCompatibility.md new file mode 100644 index 0000000..de6827d --- /dev/null +++ b/dsgraph/graph-docs/ProtocolCompatibility.md @@ -0,0 +1,15 @@ +# DataStax Graph 6.8 Protocols compatibility + +DataStax Graph 6.8 will make some protocol compatibility checks regarding the type of graph targeted, the +type of request sent and the protocol set for this request. + +Here's the compatibility scheme: + +* _GraphSON 1_, _Gryo 1_: only **scripts** allowed, only for **Classic** graph +* _GraphSON 2_: **Script** and **bytecode** queries allowed, only for **Classic** graph +* _GraphSON 3_, _Gryo 3_: **Scripts** and **bytecode** allowed for **Native** + graph +* System requests without a Graph name defined / without alias (system queries) do not require a +specific protocol. + +In general the user should allow the DSE drivers to automatically select the correct protocol. \ No newline at end of file diff --git a/dsgraph/graph-docs/README.md b/dsgraph/graph-docs/README.md new file mode 100644 index 0000000..246c04c --- /dev/null +++ b/dsgraph/graph-docs/README.md @@ -0,0 +1,52 @@ +# DataStax Graph + +## What is DataStax Graph? +DataStax Graph is a graph API that sits on top of the DSE stack. +It provides unified access to a subset of DSE features, and also allows complex queries via the Gremlin query language. + +## Why Native Engine? +The Native Engine initiative was started to bring enhanced usability and performance to DataStax Graph by: + +1. Aligning the data model with regular C* tables - Many features from DSE work more naturally with Native Engine. 
+In addition, users that have existing C* knowledge will have a good chance of understanding how their data model will +affect performance of their queries. +2. Enhanced usability - We have taken on board feedback from users and support, in particular +reducing or removing configuration or features that confused, promoted bad practise or made support difficult. +Messaging for errors and profile output has been greatly improved. +3. Enhanced performance - Read and write path have been significantly simplified allowing for greater performance. + +Existing graph behaviour has been retained by splitting in to two engines: `Classic` and `Native`: + +* `Classic` is maintained to allow backward compatibility for existing users. +* `Native` should be used for all new graphs. + +## Overview + +[What's new in DataStax Graph?](WhatsNewInDataStaxGraph.md) + +[Deprecated features](DeprecatedFeatures.md) + +[Classic to Native migration](ClassicToNativeGraphMigration.md) + +[Getting started](GettingStarted.md) + +## Schema + +[System and schema API](SystemAndSchemaAPI.md) + +[CQL graph extensions](CQLGraphExtensions.md) + +## Traversal execution and indexing + +[Traversal options](TraversalOptions.md) + +[Index analyzer](IndexAnalyzer.md) + +## Analytics + +[DataStax Graph Frames](DseGraphFrames.md) + +## Drivers + +[Protocol compatibility](ProtocolCompatibility.md) + diff --git a/dsgraph/graph-docs/SystemAndSchemaAPI.md b/dsgraph/graph-docs/SystemAndSchemaAPI.md new file mode 100644 index 0000000..0a858fa --- /dev/null +++ b/dsgraph/graph-docs/SystemAndSchemaAPI.md @@ -0,0 +1,900 @@ +# DataStax Graph - System API Usage Examples + +## Native Engine + +#### Creating a Native Engine Graph + +Native Engine graphs are created with `SimpleStrategy` by default: +``` +system.graph("test_native"). + ifNotExists(). + create() +``` + +Or you can use `NetworkTopologyStrategy` by setting the replication explicitly. +Make sure that the `DC_NAME` matches your DC name as listed by `nodetool status`. +``` +system.graph("test_native"). + ifNotExists(). + withReplication("{'class': 'NetworkTopologyStrategy', '': }"). + create() +``` + +Native Engine graphs are created with `durableWrites` set to `true` by default. +If necessary, user can specify the setting when creating the graph: +``` +system.graph("test_native"). + ifNotExists(). + withReplication("{'class': 'NetworkTopologyStrategy', '': }"). + andDurableWrites(false). + create() +``` +However, it is **NOT recommended** to set `durableWrites` to `false`. + +#### Convert an existing non-graph keyspace to a Native Engine Graph +``` +system.graph("testExisting"). + fromExistingKeyspace(). + create() +``` + +#### Dropping/Truncating a Native Engine Graph +`drop()` will drop the underlying keyspace and the tables, whereas `truncate()` will only delete their contents. +``` +system.graph("test_native"). + ifExists(). + drop() +``` + +``` +system.graph("test_native"). + ifExists(). + truncate() +``` + +## Classic Engine +#### Creating a Classic Engine Graph +``` +system.graph('testClassic'). + replication("{'class' : 'NetworkTopologyStrategy', '' : }"). + systemReplication("{'class' : 'NetworkTopologyStrategy', '' : }"). + option('graph.schema_mode'). + set('Production'). + engine(Classic). + create() +``` + +#### Dropping a Classic Engine Graph +``` +system.graph('testClassic'). + ifExists(). 
+ drop() +``` + +## Retrieving information from a Graph + +#### All available Graphs + +``` +gremlin> system.graphs() +==>test_native +==>testClassic +``` + +#### Verbose information for all Graphs (Name, Engine, RF) + +``` +gremlin> system.list() +==>Name: test_native | Engine: Native | Replication: {SearchGraph=1, class=org.apache.cassandra.locator.NetworkTopologyStrategy} +==>Name: testClassic | Engine: Classic | Replication: {SearchGraph=1, class=org.apache.cassandra.locator.NetworkTopologyStrategy} +``` + +#### Schema definition of a particular Graph + +``` +gremlin> system.graph("test_native").describe() +==>system.graph('test_native').ifNotExists().withReplication("{'class': 'org.apache.cassandra.locator.NetworkTopologyStrategy', 'SearchGraph': '1'}").andDurableWrites(true).nativeEngine().create() +``` + + + +# Native Engine Graph - Schema API Usage Examples +All examples are for `Native` graphs. +Remember that once you have created your graph you will need to alias it in the console: +``` +:remote config alias g test_native.g +``` + +## Creating a Vertex/Edge Label +#### Creating a Vertex Label + +The following example creates a vertex label `person`, having a partition key on `name` and `ssn`, using `age` as a clustering column. The underlying table will be named `person`. +``` +schema.vertexLabel('person'). + partitionBy('name', Text). + partitionBy('ssn', Text). + clusterBy('age', Int). + property('address', Text). + property('coffeePerDay', Int). + create() +``` + +And here we create a vertex label `software`, using `name` as partition key, `version` and `lang` as clustering columns. `temp` ends up being a regular and `static_property` a static column. +``` +schema.vertexLabel('software'). + partitionBy('name', Text). + clusterBy('version', Int). + clusterBy('lang', Text). + property('temp', Text). + property('static_property', Text, Static). + create() +``` + +#### Creating a Vertex Label with a custom table name +When creating a vertex label, the default table name will be equal to the label name. It is possible to override the default table name by using `.tableName()` as shown in the below example: +``` +schema.vertexLabel('personCustom'). + tableName('personTable'). + partitionBy('name', Text). + partitionBy('ssn', Text). + clusterBy('age', Int). + property('address', Text). + property('coffeePerDay', Int). + create() +``` + +#### Creating a Vertex Label from an existing Table +Assuming we have the following existing non-vertex/edge table: +``` +CREATE TABLE test_native.book_table (title varchar, isbn int, pages int, +PRIMARY KEY ((title), isbn)) +WITH CLUSTERING ORDER BY (isbn ASC); +``` +We can convert it into a vertex label using: +``` +schema.vertexLabel('book'). + fromExistingTable('book_table'). + create() +``` + +#### Creating an Edge Label + +The following example creates an edge label `created` that connects `person` to `software`. The underlying table will be named `person__created__software`. This will automatically add required mapping columns from `person` & `software` to the edge table. Since `person` is the `OUT` vertex, all of its primary key columns will be prefixed `person_` in the edge table and all of the primary key columns of `software` will be prefixed with `software_`. + +``` +schema.edgeLabel('created'). + from('person').to('software'). + property('weight', Double). 
+ create() +``` + +Eventually you will end up with the following CQL Schema: +``` +CREATE TABLE test_native.person__created__software ( + person_name text, + person_ssn text, + person_age int, + software_name text, + software_version int, + software_lang text, + weight double, + PRIMARY KEY ((person_name, person_ssn), person_age, software_name, software_version, software_lang) +) WITH CLUSTERING ORDER BY (person_age ASC, software_name ASC, software_version ASC, software_lang ASC) + AND EDGE LABEL created + FROM person((person_name, person_ssn), person_age) + TO software(software_name, software_version, software_lang); +``` +#### Creating an Edge label with a custom table name +When creating an edge label, the default table name will be `____`. It is possible to override the default table name by using `.tableName()` as shown in the below example: + +``` +schema.edgeLabel('createdCustom'). + tableName('createdTable'). + ifNotExists(). + from('person').to('software'). + property('weight', Double). + create() +``` + +#### Creating an Edge Label from an existing Table +Assuming we have the following existing non-vertex/edge table: +``` +CREATE TABLE writes_table (person_name text, ssn text, age int, book_name text, isbn int, date date, + PRIMARY KEY ((person_name), ssn, age, book_name, isbn)) + WITH CLUSTERING ORDER BY (ssn ASC, age ASC, book_name ASC, isbn ASC); +``` + +We can convert it into an edge label that connects `person` to `book`, using: +``` +schema.edgeLabel('writes'). + fromExistingTable('writes_table'). + from('person'). + mappingProperty('person_name'). + mappingProperty('ssn'). + mappingProperty('age'). + to('book'). + mappingProperty('book_name'). + mappingProperty('isbn'). + create() +``` +Note that while the column names are not required to match, the data type and the number of mapping properties must match the data type and the number of primary key columns on the vertex label in order. +For example, vertex label `person` has three primary key columns, `name(text)`, `ssn(text)`, `age(int)`, table `writes_table` has five primary key columns, +which three of them `person_name(text)`, `ssn(text)`, `age(int)` are matched in corresponding order in the above definition. + +#### Creating an Edge Label with specified mapping columns + +Given the vertex labels: +``` +schema.vertexLabel('person'). + partitionBy('name', Text). + partitionBy('age', Int). + clusterBy('year', Int). + property('coffeePerDay', Int). + create() + +schema.vertexLabel('software'). + partitionBy('name', Text). + clusterBy('year', Int). + clusterBy('license', Text). + create() +``` + +You can specify how the mapping columns are named in the edge table: +``` +schema.edgeLabel('created'). + from('person').to('software'). + partitionBy(OUT, 'name', 'person_name'). + partitionBy(OUT, 'age', 'person_age'). + partitionBy('creation_date', Text). + clusterBy(OUT, 'year', 'person_year'). + clusterBy(IN, 'name', 'software_name'). + clusterBy(IN, 'year', 'software_year'). + clusterBy(IN, 'license', 'software_license'). + clusterBy('creation_year', Int, Desc). 
+ create() +``` + +Which will lead to the below CQL Schema: +``` +CREATE TABLE test_native.person__created__software ( + person_name text, + person_age int, + creation_date text, + person_year int, + software_name text, + software_year int, + software_license text, + creation_year int, + PRIMARY KEY ((person_name, person_age, creation_date), person_year, software_name, software_year, software_license, creation_year) +) WITH CLUSTERING ORDER BY (person_year ASC, software_name ASC, software_year ASC, software_license ASC, creation_year DESC) + AND EDGE LABEL created + FROM person((person_name, person_age), person_year) + TO software(software_name, software_year, software_license); + +``` + +Generally the pattern for specifying the mapping columns is `partitionBy(direction,sourceProperty,targetProperty)` / `clusterBy(direction,sourceProperty,targetProperty)` where: + +* `direction` can be only `OUT` for `partitionBy(..)` and `IN` / `OUT` for `clusterBy(..)` +* `sourceProperty` is the name of the property/column from the source vertex label table +* `targetProperty` is the name of the mapping property/column in the edge label table. If `targetProperty` is not specified, then it will default to `direction.name().toLowerCase() + "_" + sourceProperty` + + +## Dropping a Vertex/Edge Label + +#### Dropping a Vertex Label +This will drop the table of the vertex label `software` if it exists and associated edge tables. So e.g. if there is `person->created->software` / `software->generated->software`, then there will be **3** delete statements (one for the table behind `software` and two for the associated edge tables). +``` +schema.vertexLabel('software'). + ifExists(). + drop() +``` + +Note that any connected edge labels will also be dropped. For instance `person-created->software` would be dropped, but + `person-created->building` would remain. + + +#### Dropping an Edge Label +This will drop the table of the edge label `created` if it exists. Please note that this will drop **all** connections where this particular label is being used. So e.g. if there is a `person->created->software` and a `software->created->software` connection, then both tables will be dropped. +``` +schema.edgeLabel('created'). + ifExists(). + drop() +``` + +## Dropping a Vertex/Edge Label's Metadata + +#### Dropping a Vertex Label's Metadata +This will drop the vertex label `software` but keep the underlying table and also remove the label from any associated edge tables. So e.g. if there is `person->created->software` / `software->generated->software`, then there will be **3** alter statements (one for the table behind `software` and two for the associated edge tables). +``` +schema.vertexLabel('software'). + dropMetadata() +``` + +#### Dropping an Edge Label's Metadata +This will drop the edge label `created` but keep the underlying table. Please note that this will drop **all** connections where this particular label is being used. So e.g. if there is a `person->created->software` and a `software->created->software` connection, then the edge labels of both will be removed. +``` +schema.edgeLabel('created'). + dropMetadata() +``` + +## Dropping all Metadata + +This will drop all the vertex/edge labels but keep the underlying tables. + +``` +schema.dropMetadata() + +``` + +## Adding / Dropping Properties + +#### Adding properties to a Vertex Label +``` +schema.vertexLabel('person'). + addProperty('one', Int). + addProperty('two', Int). 
+ alter() +``` + +#### Adding properties to an Edge Label +You can add properties to every single connection of the same edge label. + +``` +schema.edgeLabel('created'). + addProperty('one', Int). + addProperty('two', Int). + alter() +``` +You can add properties to the specific connection (`person->created->software`) by using `from`/`to`. +``` +schema.edgeLabel('created'). + from('person').to('software'). + addProperty('three', Int). + addProperty('four', Int). + alter() +``` +#### Dropping properties from a Vertex Label +``` +schema.vertexLabel('person'). + dropProperty('one'). + dropProperty('two'). + alter() +``` + +#### Dropping properties from an Edge Label +This will drop the specified properties from every single edge table. +``` +schema.edgeLabel('created'). + dropProperty('one'). + dropProperty('two'). + alter() +``` +You can drop properties from the specific connection (`person->created->software`) by using `from`/`to`. +``` +schema.edgeLabel('created'). + from('person').to('software'). + dropProperty('three'). + dropProperty('four'). + alter() +``` + +## Creating Indexes automatically +Use `schema.indexFor().analyze()` to show the indexes required to execute the ``: +``` +schema.indexFor(g.V().has('age', 30).out().in()).analyze() +``` +``` +Traversal requires that the following indexes are created: +schema.vertexLabel('person').materializedView('person_by_age').ifNotExists().partitionBy('age').clusterBy('name', Asc).clusterBy('year', Asc).create() +schema.edgeLabel('created').from('person').to('software').materializedView('person__created__software_by_software_license_software_name_software_year').ifNotExists().partitionBy(IN, 'license').partitionBy(IN, 'name').partitionBy(IN, 'year').clusterBy(OUT, 'name', Asc).clusterBy(OUT, 'age', Asc).clusterBy(OUT, 'year', Asc).create() +``` + +Use `schema.indexFor().apply()` to create the required indexes for the ``. +``` +schema.indexFor(g.V().has('age', 30).out().in()).apply() +``` +``` +Creating the following indexes: +schema.vertexLabel('person').materializedView('person_by_age').ifNotExists().partitionBy('age').clusterBy('name', Asc).clusterBy('year', Asc).create() +schema.edgeLabel('created').from('person').to('software').materializedView('person__created__software_by_software_license_software_name_software_year').ifNotExists().partitionBy(IN, 'license').partitionBy(IN, 'name').partitionBy(IN, 'year').clusterBy(OUT, 'name', Asc).clusterBy(OUT, 'age', Asc).clusterBy(OUT, 'year', Asc).create() +OK +``` + +More details around the workings of automatic index creation can be found [here](IndexAnalyzer.md). + + +## Creating Indexes manually +Users also have the option to create all indexes manually as shown in the following examples. + +#### Materialized Views + +This example creates a new MV and partitions it by the `coffeePerDay` property so that `g.V().has('coffeePerDay', 4)` can be fulfilled. +``` +schema.vertexLabel('person'). + materializedView('by_coffee'). + partitionBy('coffeePerDay'). + create() +``` + +For edge indexes you can reference the primary key columns from the incident vertices by using `IN` and `OUT` parameters. +In this example the entire view is partitioned by `weight` followed by clustering columns `IN` name of `software` and `OUT` name of `person` in `Desc` order. +``` +schema.edgeLabel('created'). + from('person').to('software'). + materializedView('byWeight'). + ifNotExists(). + partitionBy('weight'). + clusterBy(IN, 'name'). + clusterBy(OUT, 'name', Desc). 
+ create() +``` + +#### Secondary Indexes + +This will create a secondary index on the property/column `coffeePerDay`. +``` +schema.vertexLabel('person'). + secondaryIndex('by_coffee'). + ifNotExists(). + by('coffeePerDay'). + create() +``` + +The below example will create a column `map` of type `frozen(mapOf(Int, Text))` and index it via secondary index. Using `indexFull()` in this example will index the entire map. Available collection indexing options are `indexKeys()` / `indexValues()` / `indexEntries()` / `indexFull()`. See [here](https://docs.datastax.com/en/dse/6.0/cql/cql/cql_using/useIndexColl.html) for additional details about indexing collections. +``` +schema.edgeLabel('person'). + addProperty('map', frozen(mapOf(Int, Text))). + alter() + +schema.edgeLabel('person'). + secondaryIndex('by_map'). + by('map').indexFull(). + create() +``` + +#### Search Indexes +This will index properties/column `name` and `license`. +``` +schema.vertexLabel('software'). + searchIndex(). + by('name'). + by('license'). + create() +``` + +If columns `name` and `license` are indexed, then we can add `year` using: +``` +schema.vertexLabel('software'). + searchIndex(). + by('year'). + create() +``` + +This will index property/column `weight` on the edge label `created` +``` +schema.edgeLabel('created'). + from('person').to('software'). + searchIndex(). + ifNotExists(). + by('weight'). + create() +``` + +##### Indexing Types +When indexing a text column, you can choose to index it using `asString()` / `asText()`. + +``` +schema.vertexLabel('software'). + addProperty('stringProperty', Text). + addProperty('textProperty', Text). + alter() + +schema.vertexLabel('software'). + searchIndex(). + ifNotExists(). + by('stringProperty').asString(). + by('textProperty').asText(). + create() +``` + +* `asString()`: will be using a **non-tokenized** (`StrField`) field +* `asText()`: will be using a **tokenized** (`TextField`) field +* if no indexing type is specified, it will be using a `StrField` and a `TextField` copy field, which means that all textual predicates (token, tokenPrefix, tokenRegex, eq, neq, regex, prefix) will be usable. + +Additional details about indexing types and when to use which one can be found in the DSE [docs](https://docs.datastax.com/en/dse/6.0/dse-dev/datastax_enterprise/graph/using/useSearchIndexes.html). + +##### Dropping Indexed properties + +You can drop a property from the search index using the below syntax: + +``` +schema.vertexLabel('software'). + searchIndex(). + dropProperty('textProperty'). + alter() +``` + +The same applies for edge label search indexes: + +``` +schema.edgeLabel('created'). + from('person').to('software'). + searchIndex(). + dropProperty('weight'). + alter() +``` + + +## Dropping Indexes manually +Indexes can also be dropped manually as shown in the following examples. + +#### Dropping a Materialized View +``` +schema.vertexLabel('person'). + materializedView('by_coffee'). + ifExists(). + drop() +``` + +``` +schema.edgeLabel('created'). + from('person').to('software'). + materializedView('by_r1b'). + ifExists(). + drop() +``` + +#### Dropping a Secondary Index +``` +schema.vertexLabel('person'). + secondaryIndex('non_existing'). + ifExists(). + drop() +``` + +``` +schema.edgeLabel('created'). + from('person').to('software'). + secondaryIndex('by_r2c'). + ifExists(). + drop() +``` + +#### Dropping a Search Index +``` +schema.vertexLabel('person'). + searchIndex(). + drop() +``` + +``` +schema.edgeLabel('created'). + from('person').to('software'). 
+ searchIndex(). + drop() +``` + +## Using Complex Types + +#### List / `frozen` List +``` +schema.vertexLabel('complexList'). + partitionBy('name', Text). + property('major', listOf(Int)). + property('majorfrozen', frozen(listOf(Int))). + create() +``` + +#### Set / `frozen` Set +``` +schema.vertexLabel('complexSet'). + partitionBy('name', Text). + property('major', setOf(Int)). + property('majorfrozen', frozen(setOf(Int))). + create() +``` + +#### Map / `frozen` Map +``` +schema.vertexLabel('complexMap'). + partitionBy('name', Text). + property('versioncodename', mapOf(Int, Varchar)). + property('versioncodenamefrozen', frozen(mapOf(Int, Varchar))). + create() +``` + +#### Tuple +``` +schema.vertexLabel('complexTuple'). + partitionBy('name', Text). + property('versions1', tupleOf(Int, Varchar, Timestamp)). + create() +``` + +Using a collection as an element in a `Tuple` requires the collection to be `frozen`. +``` +schema.vertexLabel('complexTupleNested'). + partitionBy('name', Text). + property('versions1', tupleOf(Varchar, frozen(listOf(Int)))). + property('versions2', tupleOf(Varchar, frozen(setOf(Int)))). + property('versions3', tupleOf(Varchar, frozen(mapOf(Varchar, Int)))). + create() +``` + +Creating a tuple instance via `as` +``` +[2, '3', Instant.now()] as Tuple +``` + +#### Nested Collections + +Using a nested collection requires it to be `frozen`. + +``` +schema.vertexLabel('complexCollectionNested'). + partitionBy('name', Text). + property('majorminor1', listOf(frozen(setOf(Int)))). + property('majorminor2', setOf(frozen(listOf(Int)))). + property('majorminor3', mapOf(Varchar, frozen(listOf(Int)))). + create() +``` + +#### User defined types + +Creating a user defined type via schema: +``` +schema.type('address'). + property('address1', Text). + property('address2', Text). + property('postCode', Text). + create() +``` + +Using a nested user defined type via `typeOf`: +``` +schema.type('contactDetails'). + property('address', frozen(typeOf('address'))). + property('telephone', listOf(Text)). + create() +``` + +Dropping a user defined type: +``` +schema.type('contactDetails').drop() +``` + +Using a user defined type in a label: +``` +schema.vertexLabel('contact'). + partitionBy('name', Text). + property('address', typeOf('address')). + create() +``` + +For each UDT in the keyspace a class will be created that allows the user to use `as` syntax for creation. In this case +`address` is a synthetic type. +Creating and initializing a user defined type via `as`: +``` +[address1: 'add1', address2: 'add2', postCode: 'pc'] as address +``` + +If there are nested complex types the conversion for nested elements will happen automatically. For instance if +`address` has a column of type `nestedUdt` there is no need to specify the nested type: + +``` +[ + address1:'add1', + address2:'add2', + postCode:'pc', + nestedUdt:[nested1:'n1', nested2:'n2'] +] as address + +//Is the same as +[ + address1:'add1', + address2:'add2', + postCode:'pc', + nestedUdt:[nested1:'n1', nested2:'n2'] as nestedUdt +] as address +``` + +## An example that uses all available types + +``` +schema.type('address'). + property('address1', Text). + property('address2', Text). + property('postCode', Text). + create() +``` + +``` +schema.vertexLabel('allTypes'). + partitionBy('id', Int). + property('ascii', Ascii). + property('bigint', Bigint). + property('blob', Blob). + property('boolean', Boolean). + property('date', Date). + property('decimal', Decimal). + property('double', Double). + property('duration', Duration). 
+ property('float', Float). + property('inet', Inet). + property('int', Int). + property('linestring', LineString). + property('point', Point). + property('polygon', Polygon). + property('smallint', Smallint). + property('text', Text). + property('time', Time). + property('timestamp', Timestamp). + property('timeuuid', Timeuuid). + property('tinyint', Tinyint). + property('uuid', Uuid). + property('varchar', Varchar). + property('varint', Varint). + property('list', listOf(Int)). + property('set', setOf(Int)). + property('map', mapOf(Int, Int)). + property('tuple', tupleOf(Int, Int, Int)). + property('udt', typeOf('address')). + create() +``` + + +`Counter` is a special type and cannot be used with other column types, otherwise you will see the error `Cannot mix counter and non counter columns in the same table`. So here's a separate `Counter` example: +``` +schema.vertexLabel('counterExample'). + partitionBy('name', Text). + property('counter', Counter). + create() +``` + + +Here's an overview of how the column types map to their java types: + +| **Column Type** | **Java Type** | +|:-------------- |:---------------| +|Ascii|String.class| +|Bigint|Long.class| +|Blob|ByteBuffer.class| +|Boolean|Boolean.class| +|Counter|Long.class| +|Date|LocalDate.class| +|Decimal|BigDecimal.class| +|Double|Double.class| +|Duration|com.datastax.driver.core.Duration.class| +|Float|Float.class| +|Inet|InetAddress.class| +|Int|Integer.class| +|linestring|com.datastax.driver.dse.geometry.LineString.class| +|point|com.datastax.driver.dse.geometry.Point.class| +|polygon|com.datastax.driver.dse.geometry.Polygon.class| +|Smallint|Short.class| +|Text|String.class| +|Time|LocalTime.class| +|Timestamp|Instant.class| +|Timeuuid|UUID.class| +|Tinyint|Byte.class| +|Uuid|UUID.class| +|Varchar|String.class| +|Varint|BigInteger.class| +|listOf(..)|List.class| +|setOf(..)|Set.class| +|mapOf(..)|Map.class| +|tupleOf(..)|com.datastax.driver.core.TupleValue.class| +|typeOf(..)|com.datastax.driver.core.UDTValue.class| + + +An example of how to insert data for all types is shown below. Please note that you can't do `.property('inet', InetAddress.getByName('localhost'))` because our Sandbox is currently blocking this. +``` +g.addV('allTypes'). + property('id', 232). + property('ascii', 'ascii'). + property('bigint', 23L). + property('blob', [3, 4] as ByteBuffer). + property('boolean', true). + property('date', '2007-12-03' as LocalDate). + property('decimal', 2.3). + property('double', 2.3d). + property('duration', 'PT10H' as Duration). + property('float', 2.3f). + //property('inet', InetAddress.getByName('localhost')). + property('int', 23). + property('linestring', [1, 1, 2, 2, 3, 3] as LineString)). + property('linestring', 'LINESTRING (30 10, 10 30, 40 40)' as LineString)). + property('point', [1.1, 2.2] as Point). + property('point', 'POINT (30 10)' as Point). + property('polygon', [0.9, 0.9, 1.1, 0.9, 1.1, 1.1, 0.9, 1.1] as Polygon). + property('polygon', 'POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))' as Polygon). + property('smallint', 23 as short). + property('text', 'some text'). + property('time', '10:15:30' as LocalTime). + property('timestamp', '2007-12-03T10:15:30.00Z' as Instant). + property('timeuuid', UUIDs.timeBased()). + property('tinyint', 38 as byte). + property('uuid', 'bb6d7af7-f674-4de7-8b4c-c0fdcdaa5cca' as UUID). + property('varchar', 'some text'). + property('varint', 1). + property('list', [1, 2, 3]). + property('set', [1, 2, 3] as Set). + property('map', [k1:'v1', k2:v2]). 
+ property('tuple', [1, 2, 3] as Tuple). + property('udt', [address1:'add1', address2:'add2', postCode:'pc'] as address) +``` + + +## Dropping Schema + +This will drop all the indexes, edge labels, vertex labels and user defined types in the schema. + +``` +schema.drop() +``` + +## Describe Schema + +This will describe all the UDTs, vertex labels, edge labels and indexes in the schema. + +``` +schema.describe() +``` + +## Describe a Vertex Label + +A particular vertex label's schema definition can be described with: + +``` +schema.vertexLabel('person').describe() +``` + +## Describe all Vertex Labels + +It is possible to retrieve the schema definition of all vertex labels by running: + +``` +schema.vertexLabels().describe() +``` + +## Describe an Edge Label + +A particular edge label's schema definition can be described with: + +``` +schema.edgeLabel('created').from('person').to('software').describe() +``` +e.g. `person->created->software` + +A particular edge label's schema definitions can be described with: + +``` +schema.edgeLabel('created').describe() +``` +e.g. `person->created->software`, `software->created->software` + + +## Describe all Edge Labels + +It is possible to retrieve the schema definition of all edge labels by running: + +``` +schema.edgeLabels().describe() +``` + +## Describe a User Defined Type + +A particular user defined type definition can be described with: + +``` +schema.type('address').describe() +``` + +## Describe all User Defined Types + +It is possible to retrieve the definition of all user defined types by running: + +``` +schema.types().describe() +``` diff --git a/dsgraph/graph-docs/TraversalOptions.md b/dsgraph/graph-docs/TraversalOptions.md new file mode 100644 index 0000000..4a3f42e --- /dev/null +++ b/dsgraph/graph-docs/TraversalOptions.md @@ -0,0 +1,78 @@ +# DataStax Graph - Traversal Configurations + +## Native Engine Graph + +#### Changing the limits on allowed mutations + +Traversals that change the data, e.g. traversals containing `addV()`, can +eventually lead up to too many underlying mutating queries. We're now +limiting these to a default of 10k underlying CQL queries, though this limit is +configurable. + +To alter the mutations limit of a traversal, begin your traversal with: +`g.with('max-mutations', N)` where `N` is the new maximum of mutating statements +that can be executed as a result of executing the traversal, e.g. +`g.with('max-mutations', 10).addV(...)`. + +To completely disable the limits check, set the limit to a number (any) that's +`<= 0`, e.g. start your traversal with `g.with('max-mutations', -1).addV(...)`. + + +#### Allow Filtering modulator + +Allow filtering modulator is a mirror function of `ALLOW FILTERING` in C*. +It is not recommended to use in production. +The performance penalty with `ALLOW FILTERING` would still apply. +By default, it is disabled unless specified for a particular traversal. + +Assume that there is a vertex label `customer` without any indexes, the following query: +``` +g.V().has('customer', 'name', 'Cat') +``` +would give you an error: +``` +Traversal requires that the following indexes are created: ... +``` + +By adding the `with` step and `allow-filtering` on the traversal source `g`: +``` +g.with('allow-filtering').V().has('customer', 'name', 'Cat') +``` +The query doesn't require an index and fetches the results from all nodes in the cluster. + +Please note that, if the query is satisfied by an index (e.g. MV, Secondary index, Search index), +the allow filtering option would be ignored. 
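
As a minimal sketch of that last point (assuming the `customer` vertex label above, and a hypothetical index name `customer_by_name`): once an index covers the predicate, the same traversal is served by that index and the `allow-filtering` option has no effect.

```
// Hypothetical secondary index on customer.name, using the Schema API shown above
schema.vertexLabel('customer').
  secondaryIndex('customer_by_name').
  ifNotExists().
  by('name').
  create()

// Now served by the secondary index; no cluster-wide filtering despite the option being set
g.with('allow-filtering').V().has('customer', 'name', 'Cat')
```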
+ +#### Forcing vertex deletion in the absence of supporting indexes + +Deleting a vertex also attempts to delete any edges incident on that vertex. If a vertex label has an incident edge label uncovered by any index or table in at least one direction, then graph cannot efficiently find those incident edges for deletion (or determine that they do not exist). By default, graph throws an exception in this case. + +The `force-vertex-deletion` option can override this exception and perform deletion to the extent supported by the schema. When `force-vertex-deletion` is true, the vertex will be deleted, and so will any incident edges covered by a usable index or for which the edge table is keyed by the deleted vertex ID. Any other incident edges -- those for which a scan would be required -- will not be deleted. This option can potentially leave edge table rows pointing to a vertex that no longer exists. + +``` +g.with('force-vertex-deletion').V().has('name', 'Cat').drop() +``` + +#### Ignoring non-indexed data + +For graph exploration type applications it is sometimes useful to be able to execute traversals and get **partial** results back and ignore data that cannot be accessed without an index. + +Assuming the existence of `customer` and `meal` vertex labels where only `customer` has an index on `name`, the query `g.V().has('name', 'Cat')` would result in: +``` +Traversal requires that the following indexes are created: ... +``` + +By adding the `with` step and `ignore-unindexed` on the traversal source `g`, the below query will return only `customer` vertices: +``` +g.with('ignore-unindexed').V().has('name', 'Cat') +``` + +--- + +**NOTE** + +There's no additional performance penalty to using `with('ignore-unindexed')`. The result set might contain **partial** results and ignore data that cannot be accessed without an index. + +--- + + diff --git a/dsgraph/graph-docs/WhatsNewInDataStaxGraph.md b/dsgraph/graph-docs/WhatsNewInDataStaxGraph.md new file mode 100644 index 0000000..6c5aa2f --- /dev/null +++ b/dsgraph/graph-docs/WhatsNewInDataStaxGraph.md @@ -0,0 +1,254 @@ +# What's new in DataStax Graph? + +## Improved Error Messaging + +CTRL-C in gremlin console aborts the current request rather than exits. Use the `:exit` command to exit the console. + +``` +gremlin> :exit +``` + +## Improved Error Messaging + +DataStax Graph will provide improved and detailed error messages in case a traversal cannot be fulfilled due to missing indexes. +The output will look as shown below and will contain information such as: + +* the executed traversal that failed +* details of the step that failed +* the CQL that was executed and failed +* an index suggestion that can be applied in order to fulfill the traversal +* alternative approaches to creating an index + + +``` +gremlin> g.V().hasLabel("a").has("age", 23) +One or more indexes are required to execute the traversal: g.V().hasLabel("a").has("age",(int) 23) +Failed step: DseGraphStep(__.V().hasLabel("a").has("age",(int) 23)) +CQL execution: No table or view could satisfy the query 'SELECT * FROM bla.a WHERE age = ?' +The output of 'schema.indexFor().analyze()' suggests the following indexes could be created to allow execution: + +schema.vertexLabel('a').materializedView('a_by_age').ifNotExists().partitionBy('age').clusterBy('id', Asc).create() + +Alternatively consider using: +g.with('ignore-unindexed') to ignore unindexed traversal. Your results may be incomplete. +g.with('allow-filtering') to allow filtering. This may have performance implications. 
+``` + +The main benefit is that it helps users in understanding what went wrong at a particular step and what can be done to resolve it. + +## Index suggestion mechanism + +DataStax Graph has the ability to figure out what indexes a given traversal requires. This can be done by executing `schema.indexFor().analyze()` as shown below: + +``` +gremlin> schema.indexFor(g.V().has("age", 23)).analyze() +==>Traversal requires that the following indexes are created: +schema.vertexLabel('person').materializedView('person_by_age').ifNotExists().partitionBy('age').clusterBy('firstname', Asc).clusterBy('lastname', Asc).create() +``` + +The index analyzer will also indicate in case a traversal can be fulfilled by existing indexes: +``` +gremlin> schema.indexFor(g.V().has("name", "x")).analyze() +==>Traversal can be satisfied by existing indexes +``` + +Index suggestions can also be directly applied by executing `schema.indexFor().apply()`: + +``` +gremlin> schema.indexFor(g.V().has("age", 23)).apply() +==>Creating the following indexes: +schema.vertexLabel('person').materializedView('person_by_age').ifNotExists().partitionBy('age').clusterBy('firstname', Asc).clusterBy('lastname', Asc).create() +OK +``` + +Additional details to the index suggestion mechanism can be found [here](IndexAnalyzer.md). + + +## Simplified and improved Schema API + +DataStax Graph has an improved Schema API that allows the creation/modification of vertex/edge labels and indexes. +The Schema API abstracts away complexity and is also closer to Cassandra terminology when it comes to partitioning data through the usage of `partitionBy(..)` / `clusterBy(..)`. + +The following example creates a vertex label `person`, having a partition key on `name` / `ssn` and using `age` as a clustering column. The example also creates a vertex label `software`, using `name` as partition key, `version` and `lang` as clustering columns. `temp` ends up being a regular and `static_property` a static column: +``` +schema.vertexLabel('person'). + ifNotExists(). + partitionBy('name', Text). + partitionBy('ssn', Text). + clusterBy('age', Int). + property('address', Text). + property('coffeePerDay', Int). + create() + +schema.vertexLabel('software'). + ifNotExists(). + partitionBy('name', Text). + clusterBy('version', Int). + clusterBy('lang', Text). + property('temp', Text). + property('static_property', Text, Static). + create() +``` + +Given the two vertex labels `person` and `software`, one can create the connection `person-created->software` as shown below: + +``` +schema.edgeLabel('created'). + ifNotExists(). + from('person').to('software'). + property('weight', Double). + create() +``` + +An example of creating a MV and partition it by the `age` property so that graph queries against `age` can be fulfilled is shown below: +``` +schema.vertexLabel('person'). + materializedView('by_age'). + partitionBy('age'). + create() +``` + +Additional Schema API details with usage examples can be found [here](SystemAndSchemaAPI.md). + +## Transparent Data Model + +DataStax Graph uses a more transparent data model with the following characteristics: + +* graph = CQL keyspace +* vertex/edge label = CQL table +* property of the underlying vertex/edge label = CQL column + +This means that users can keep their existing data and let a keyspace be treated as a graph in order to perform graph traversals. The data is then explorable through graph and CQL tools. +Additionally, bulk loading graph data can be performed through CQL without having to use a custom tool. 
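
As an illustrative sketch of what this alignment means in practice (reusing the `person` vertex label defined earlier and assuming a graph keyspace named `test_native`), ordinary CQL can read and write the rows that back a vertex label, and the same rows are immediately visible to Gremlin traversals:

```
-- Hypothetical sample data; any CQL client (cqlsh, drivers, bulk loaders) can write it
INSERT INTO test_native.person (name, ssn, age, address)
VALUES ('Cat', '111-22-3333', 30, '123 Main St');

-- Reading the row back by its full partition key
SELECT * FROM test_native.person WHERE name = 'Cat' AND ssn = '111-22-3333';
```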
+ + +## CQL grammar to specify Graph Metadata on Keyspaces/Tables + +DataStax Graph comes with new CQL grammar that allows keyspaces to be treated as graphs and tables to be treated as vertex/edge labels in that graph. This is especially helpful for users that would like to convert their existing data to a graph. + +Executing `ALTER KEYSPACE ks WITH AND graph_engine = 'Native'` on an existing keyspace will treat that keyspace as a graph in DSE. + +By executing `ALTER TABLE ks.tbl WITH VERTEX LABEL ` or `ALTER TABLE ks.tbl WITH EDGE LABEL FROM vLabelOne(...) TO vLabelTwo(...)` a CQL table can be represented as a vertex/edge label. + +The [Schema API](SystemAndSchemaAPI.md) abstracts away the CQL grammar and easy conversion of existing keyspaces/tables to graphs/element labels. +For a given keyspace, one can execute `system.graph("test").fromExistingKeyspace().create()`. +One can convert an existing table to a vertex label using `schema.vertexLabel('book').fromExistingTable('book_table').create()`. + +Additional details around the CQL grammar with usage examples can be found [here](CQLGraphExtensions.md). Details and examples around the Schema API can be found [here](SystemAndSchemaAPI.md). + +## Improved Profiling output + +The new `.profile()` output shows details about the steps of a given traversal and the CQL each step needs to execute. CQL statements are grouped and include duration information. The improved output format helps determining where potential bottlenecks reside. +The output of `g.V().has("id", 1).profile()` might look as shown below: + +``` +Step Count Traversers Time (ms) % Dur +============================================================================================================= +DseGraphStep(__.V().has("id",(int) 1)) 3 3 20.200 70.69 + CQL statements ordered by overall duration 35.418 + \_1=SELECT * FROM sample.company WHERE id = ? / Duration: 11 ms / Count: 1 + \_2=SELECT * FROM sample.person WHERE id = ? / Duration: 11 ms / Count: 1 + \_3=SELECT * FROM sample.software WHERE id = ? / Duration: 11 ms / Count: 1 +HasStep([id.eq(1)]) 3 3 6.915 24.20 +ReferenceElementStep 3 3 1.461 5.12 + >TOTAL - - 28.578 - +``` + +Appending `.out("works_with")` to the previous traversal in order to get `g.V().hasLabel("person").has("id", 1).out("works_with").profile()` might result in: +``` +Step Count Traversers Time (ms) % Dur +============================================================================================================= +DseGraphStep(__.V().hasLabel("person").has("id"... 1 1 1.860 11.28 + CQL statements ordered by overall duration 0.955 + \_1=SELECT * FROM sample.person WHERE id = ? / Duration: < 1 ms / Count: 1 +HasStep([~label.eq(person), id.eq(1)]) 1 1 0.849 5.15 +DseVertexStep(__.out().hasLabel("works_with")) 1 1 12.637 76.64 + CQL statements ordered by overall duration 3.993 + \_1=SELECT * FROM sample.person__works_with__software WHERE person_id = ? + AND person_age = ? / Duration: 2 ms / Count: 1 + \_2=SELECT * FROM sample.software WHERE id = ? AND age = ? / Duration: 1 ms / Cou + nt: 1 +ReferenceElementStep 1 1 1.141 6.92 + >TOTAL - - 16.489 - +``` + +## Full C* Type Alignment + +DataStax Graph supports all C* types, including complex types, such as CQL collections and UDTs / Tuples. Below is an example that shows a vertex label with all types: + +``` +schema.type('address'). + property('address1', Text). + property('address2', Text). + property('postCode', Text). + create() + +schema.vertexLabel('allTypes'). + partitionBy('id', Int). 
  property('ascii', Ascii).
  property('bigint', Bigint).
  property('blob', Blob).
  property('boolean', Boolean).
  property('date', Date).
  property('decimal', Decimal).
  property('double', Double).
  property('duration', Duration).
  property('float', Float).
  property('inet', Inet).
  property('int', Int).
  property('linestring', LineString).
  property('point', Point).
  property('polygon', Polygon).
  property('smallint', Smallint).
  property('text', Text).
  property('time', Time).
  property('timestamp', Timestamp).
  property('timeuuid', Timeuuid).
  property('tinyint', Tinyint).
  property('uuid', Uuid).
  property('varchar', Varchar).
  property('varint', Varint).
  property('list', listOf(Int)).
  property('set', setOf(Int)).
  property('map', mapOf(Int, Int)).
  property('tuple', tupleOf(Int, Int, Int)).
  property('udt', typeOf('address')).
  create()
```

Additional information about supported types can be found in the [Schema API](SystemAndSchemaAPI.md).


## Direct edge lookup may be used

If edges are indexed, they may be queried directly rather than via a vertex.
```
schema.edgeLabel('created').
  from('person').to('software').
  materializedView('byWeight').
  ifNotExists().
  partitionBy('weight').
  clusterBy(IN, 'name').
  clusterBy(OUT, 'name', Desc).
  create()

g.E().hasLabel('created').has('weight', 0.8).inV()
```


## Search indexes may be used for edges

DataStax Graph now allows search indexes to be used on edges. In particular this is useful for tokenized edge queries, but
also for any other predicate supported by search.

```
schema.edgeLabel('created').
  from('person').to('software').
  searchIndex().
  ifNotExists().
  by(OUT, 'name').
  by('weight').
  create()

g.V().outE().has('weight', neq(0.8))
```
diff --git a/dsgraph/graph-migration/README.md b/dsgraph/graph-migration/README.md
new file mode 100644
index 0000000..99ea5f3
--- /dev/null
+++ b/dsgraph/graph-migration/README.md
@@ -0,0 +1,80 @@
# Migrate Classic Graph to Native Graph


## Generate Native Graph Schema
The graph schema migration tool creates a classic graph frame and, based on the classic graph frame's schema, generates the native graph schema.

The schema can be output in two formats, cql and gremlin.
The cql format is a CQL script that creates the graph keyspace and tables under cqlsh.
The gremlin format is a Groovy script that creates the native graph schema under gremlin-console or Studio.

Run the following command to output the native graph schema generation script while the DSE server is up:
```
dse graph-migrate-schema [-cql | -gremlin] <classic-graph-name> <native-graph-name>
```

- `-cql`: specifies the output as cql format.
- `-gremlin`: specifies the output as gremlin format.
- `<classic-graph-name>`: the name of the existing classic graph you would like to migrate from.
- `<native-graph-name>`: the name of the new native graph you would like to create.

Please note that the following are NOT supported:
- Meta properties
- Multi properties
- Indexes

All properties are single. Multi properties are migrated as a single property. Meta properties, indexes, and MVs are dropped.
If custom handling of meta and multi properties is needed, users should modify the generated native graph schema accordingly; some properties may need to be renamed or dropped.
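
As a purely illustrative sketch of what this means for a multi property (hypothetical property and label names; the exact generated script depends on your classic schema):

```
// Classic schema (hypothetical): 'nickname' defined as a multi property on 'person'
schema.propertyKey('nickname').Text().multiple().create()
schema.vertexLabel('person').properties('nickname').create()

// In the generated native schema the same property becomes a plain single-valued column, e.g.:
//   property('nickname', Text)
// and during data migration only the first value of each multi property is kept.
```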
+ +To save the generated output as a script into a file, you may run the following: +``` +dse graph-migrate-schema -gremlin reviewerRating nreviewerRating > native_schema.groovy +``` + +To create the native graph schema, pass the generated script to gremlin-console or cqlsh depending on the type of format. + +Example: +``` +dse gremlin-console -e native_schema.groovy +``` + +## Migrate Data + +The following provided script is just an example of the data migration process. + +Please note that if you have modified the generated schema in the previous step, you should modify the migration script accordingly. See "Development notes" for more information. + +build: +``` +sbt package +``` + +and then submit to Spark: +``` +dse spark-submit target/scala-2.11/graph-migration_2.11-0.1.jar +``` + +Example: +``` +dse spark-submit target/scala-2.11/graph-migration_2.11-0.1.jar reviewerRating nreviewerRating +``` + +### Development notes + +Reference: [com.datastax.graph.MigrateData](src/main/scala/com/datastax/graph/MigrateData.scala) + +- `migrateVertices()` + + This method enumerates native vertex labels, selects proper rows from classic DGF vertices and writes them to appropriate tables. + +- `migrateEdges()` + + This method enumerates native edge labels, extracts (inLabel, edgeLabel, outLabel) triplet from it, selects proper rows from classic DGF edges, converts edge ids into the native format and writes data to the corresponded table. + +- `handleMultiAndMetaProperies()` + + This method is expected to be overridden to properly migrate multi and meta properties. By default, it just drops all metadata and selects the first multi properties. + +- `getLegacyPropertyName()` + + This method should be overridden if the property name was manually changed in the schema generated by the `graph-migrate-schema`. diff --git a/dsgraph/graph-migration/build.sbt b/dsgraph/graph-migration/build.sbt new file mode 100644 index 0000000..252d8c8 --- /dev/null +++ b/dsgraph/graph-migration/build.sbt @@ -0,0 +1,30 @@ +name := "graph-migration" + +version := "0.1" + +scalaVersion := "2.11.8" + +// Please make sure that following DSE version matches your DSE cluster version. +val dseVersion = "6.8.0" + +resolvers += "DataStax Repo" at "https://repo.datastax.com/public-repos/" +resolvers += Resolver.mavenLocal // for testing + +mainClass in (Compile, packageBin) := Some("com.datastax.graph.MigrateData") + +// Warning Sbt 0.13.13 or greater is required due to a bug with dependency resolution +//libraryDependencies += "com.datastax.dse" % "dse-spark-dependencies" % dseVersion % "provided" + +// WORKAROUND for non published DSE. +// all folowing code should be removed after public release. dse-spark-dependencies artiact is enough +// please set DSE_HOME env variable + +val DSE_HOME = file(sys.env.getOrElse("DSE_HOME", sys.env("HOME")+"/dse")) +// find all jars in the DSE +def allDseJars = { + val finder: PathFinder = (DSE_HOME) ** "*.jar" + finder.get +} +// add all jars to dependancies +unmanagedJars in Compile ++= allDseJars +unmanagedJars in Test ++= allDseJars diff --git a/dsgraph/graph-migration/src/main/scala/com/datastax/graph/MigrateData.scala b/dsgraph/graph-migration/src/main/scala/com/datastax/graph/MigrateData.scala new file mode 100644 index 0000000..2c9d3c3 --- /dev/null +++ b/dsgraph/graph-migration/src/main/scala/com/datastax/graph/MigrateData.scala @@ -0,0 +1,196 @@ +/* + * Copyright DataStax, Inc. + * + * Please see the included license file for details. 
+ */ + +package com.datastax.graph + +import com.datastax.bdp.graph.spark.graphframe._ +import com.datastax.bdp.graph.spark.graphframe.dsedb.NativeDseGraphFrame +import com.datastax.bdp.graph.spark.graphframe.legacy.LegacyDseGraphFrame +import com.datastax.bdp.graphv2.dsedb.schema.Column.{ColumnType => NativeColumnType, Type => NativeType} +import com.datastax.bdp.graphv2.engine.GraphKeyspace +import com.datastax.bdp.graphv2.engine.GraphKeyspace.VertexLabel +import org.apache.spark.sql._ +import org.apache.spark.sql.cassandra._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{ArrayType, StructField, StructType} + +import scala.collection.JavaConverters._ + +object MigrateData { + + /** + * this method have to be overridden to handle multi properties that a represented as array and properites with meta + * that are represented as SparkSQL struct + * by default meta-properties are just dropped. + * the first of multi properties is returned + * + * @param rawColumn + * @return column with selected data + */ + def handleMultiAndMetaProperties(rawColumn: StructField) = { + val c = col(rawColumn.name) + rawColumn.dataType match { + // handle mutli property with meta-properties + // extract first first multi-property with c(0) and drop meta properties with ("value") + case ArrayType(StructType(fields), containsNull) => c(0)("value") + // handle mutli property without meta-properties + case ArrayType(field, containsNull) => c(0) + // drop meta-properties from single property + case StructType(fields) => c("value") + // no need to checnge type + case _ => c + } + } + + /** + * the method should be overridden to match properties renamed during schema migration + * default implementation assumes that property names was not changed and apply only toGfName function to it + * + * @param property Native Graph property + * @return property name in provided Vertex and Edge dataframe + */ + def getLegacyPropertyName(property: GraphKeyspace.PropertyKey): String = { + DseGraphFrame.toGfName(property.name()) + } + + /** + * this method should be overridden in case vertex ids were changed. + * The default implementation assumes that vertex ids are the same and only call getLegacyPropertyName() for them + * The method is used only during edge migration, additional steps are needed for vertex migration + * for example the method replace "src" column with a number of "in_" columumn for DSE native edge table. + * @param df to modify + * @param label native vertex schema to extract id structure + * @param legacy legacy schema to extract legacy vertex id structure + * @param idName "src" or "dst" + * @return dataframe with added native vertex id columns and removed idName column + */ + + def addEdgeIdColumns(df: DataFrame, label: VertexLabel, legacy: LegacyDseGraphFrame, idName: String): DataFrame = { + val vertex = legacy.graphSchema.getVertex(label.name) + + var newDf = LegacyDseGraphFrame.addNaturalVertexIdColumns(df, label.name(), legacy.graphSchema, col(idName)) + // we assumes that property names was not changed + for (prop <- label.primaryPropertyKeys().asScala) { + newDf = newDf.withColumnRenamed(getLegacyPropertyName(prop), prop.column().get().name()) + } + newDf.drop(idName) + } + + /** + * Load vertices separately. The target schema should be created and modified before this call + * For example after applying custom rules for multi and meta properties. 
   *
   * @param legacy source classic graph frame (its vertices DataFrame is read)
   * @param native target native graph frame
   * @param spark  current spark session
   */
  def migrateVertices(legacy: LegacyDseGraphFrame, native: NativeDseGraphFrame, spark: SparkSession): Unit = {
    val vertices = legacy.V.df

    // vertex labels to enumerate
    val vertexLabels: Seq[GraphKeyspace.VertexLabel] = native.graphSchema.vertexLabels().asScala.toSeq
    val dfSchema = vertices.schema
    for (vertexLabel: GraphKeyspace.VertexLabel <- vertexLabels) {
      // prepare native vertex columns for this label
      val propertyColumns = vertexLabel.propertyKeys().asScala.map((property: GraphKeyspace.PropertyKey) => {
        val name: String = getLegacyPropertyName(property)
        val rawColumn: StructField = dfSchema(name)
        // drop meta and multi properties; the method can be changed to return Seq[Column] if more than one column
        // is created based on one legacy property
        val finalColumn = handleMultiAndMetaProperties(rawColumn)
        // the Duration type representation has changed; this line could be removed if no Duration is used in the schema
        val scaleColumn = durationToNanoseconds(property.column().get.`type`(), finalColumn)
        scaleColumn as DseGraphFrame.toGfName(property.name())
      })

      // filter rows and columns related to the given label
      val vertexDF = vertices.filter(col(DseGraphFrame.LabelColumnName) === vertexLabel.name())
        .select(propertyColumns: _*)
      // save vertices in the native graph
      native.updateVertices(vertexLabel.name(), vertexDF)
    }
  }

  private def durationToNanoseconds(columnType: NativeColumnType, col: Column): Column = {
    if (columnType == NativeType.Duration) col * 1000000 else col
  }

  /**
   * Load edges separately. The target schema should be created and modified before this call
   *
   * @param edges edge Dataframe
   * @param legacySchema old graph schema for id conversions. legacyGraph.schema() call returns it.
+ * @param nativeGraphName target graph + * @param spark current spark session + */ + + def migrateEdges(legacy: LegacyDseGraphFrame, native: NativeDseGraphFrame, spark: SparkSession): Unit = { + // it could be good to cache edges here + val edges = legacy.E.df + + val dfSchema = edges.schema + // enumerate all edge labels, actually triplets: out_vertex_label->edge_label->in_vertex_label + for (edgeLabel <- native.graphSchema.edgeLabels().asScala.toSeq) { + val outLabelName = edgeLabel.outLabel.name() + val edgeLabelName = edgeLabel.name() + val inLabelName = edgeLabel.inLabel.name() + val propertyColumns = edgeLabel.propertyKeys().asScala.map(property => { + // legacy edge internal property "id" is mapped to native "id" column + val name = if(property.name() == "id") "id" else getLegacyPropertyName(property) + val scaleColumn = durationToNanoseconds(property.column().get.`type`(), col(name)) + scaleColumn as name + }) + // filter data for one native DSE-DB table + val singleEdgeTable = edges.filter( + (col("~label") === edgeLabelName) and + col("src").startsWith(outLabelName + ":") and + col("dst").startsWith(inLabelName + ":")) + .select((propertyColumns :+ col("src")) :+ col("dst"): _*) + // replace "src" column with unpacked out_ columns + val unpackSrcTable = addEdgeIdColumns(singleEdgeTable, edgeLabel.outLabel, legacy, "src") + // replace "dst" column with unpacked in_ columns + val unpackDstTable = addEdgeIdColumns(unpackSrcTable, edgeLabel.inLabel, legacy, "dst") + + // save edges in the native graph + native.updateEdges(outLabelName, edgeLabelName, inLabelName, unpackDstTable) + + } + } + + def main(args: Array[String]): Unit = { + if (args.length < 2) { + usage(); + System.exit(-1); + } + val legacyGraphName = args(0) + val nativeGraphName = args(1) + + val spark = SparkSession + .builder + .appName(s"Migrate data from $legacyGraphName to $nativeGraphName") + .getOrCreate() + + try { + val legacy = spark.dseGraph(legacyGraphName).asInstanceOf[LegacyDseGraphFrame] + val native = spark.dseGraph(nativeGraphName).asInstanceOf[NativeDseGraphFrame] + + migrateVertices(legacy, native, spark) + migrateEdges(legacy, native, spark) + + } catch { + case e: Exception => { + e.printStackTrace() + usage() + } + } finally { + spark.stop() + } + } + def usage(): Unit = { + println("\nUsage: data_migration legacyGraphName nativeGraphName") + } + +}