Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,12 @@ Cask is a trademark of Cask Data, Inc. All rights reserved.

Apache, Apache HBase, and HBase are trademarks of The Apache Software Foundation. Used with
permission. No endorsement by The Apache Software Foundation is implied by the use of these marks.
## ByteSize and TimeDuration Parsers

Wrangler now supports parsing data size and time duration units in recipes.

**Supported Units**:
- ByteSize: B, KB, MB, GB, TB, PB, EB (case-insensitive; binary multiples, 1 KB = 1024 bytes)
- TimeDuration: ns, ms, s, m, h (case-insensitive)

### New Directive: `aggregate-stats`
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright [year] [your name or organization]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


package io.cdap.wrangler.api.parser;

import com.google.gson.JsonElement;
import com.google.gson.JsonPrimitive;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A {@link Token} that parses and holds a byte-size literal such as "10KB" or "1.5MB".
 *
 * <p>A literal is a non-negative decimal number immediately followed by one of the units
 * B, KB, MB, GB, TB, PB or EB (case-insensitive). Surrounding whitespace is ignored.
 * Units are binary multiples: 1 KB = 1024 bytes, 1 MB = 1024 KB, and so on.
 * A fractional number of bytes is truncated towards zero.
 */
public class ByteSize implements Token {
  private static final Map<String, Long> UNIT_MULTIPLIERS = new HashMap<>();
  private static final Pattern PATTERN = Pattern.compile("(?i)(\\d+(\\.\\d+)?)([KMGTPE]?B)");

  static {
    UNIT_MULTIPLIERS.put("B", 1L);
    UNIT_MULTIPLIERS.put("KB", 1024L);
    UNIT_MULTIPLIERS.put("MB", 1024L * 1024);
    UNIT_MULTIPLIERS.put("GB", 1024L * 1024 * 1024);
    UNIT_MULTIPLIERS.put("TB", 1024L * 1024 * 1024 * 1024);
    UNIT_MULTIPLIERS.put("PB", 1024L * 1024 * 1024 * 1024 * 1024);
    UNIT_MULTIPLIERS.put("EB", 1024L * 1024 * 1024 * 1024 * 1024 * 1024);
  }

  private final long bytes;
  private final String original;

  /**
   * Parses the given literal.
   *
   * @param value byte-size literal, e.g. "10KB"; must not be {@code null}
   * @throws IllegalArgumentException if the literal is malformed or the resulting number of
   *     bytes does not fit in a {@code long}
   */
  public ByteSize(String value) {
    this.original = value;
    this.bytes = parse(value);
  }

  /**
   * Converts a byte-size literal into a number of bytes.
   *
   * @param input the literal to convert
   * @return the size in bytes, truncated towards zero
   * @throws IllegalArgumentException if the literal is malformed or out of range
   */
  private static long parse(String input) {
    Matcher matcher = PATTERN.matcher(input.trim());

    if (!matcher.matches()) {
      throw new IllegalArgumentException("Invalid byte size: " + input);
    }

    String digits = matcher.group(1);
    String unit = matcher.group(3).toUpperCase();

    Long multiplier = UNIT_MULTIPLIERS.get(unit);
    if (multiplier == null) {
      throw new IllegalArgumentException("Unsupported byte unit: " + unit);
    }
    long factor = multiplier;

    // Integer literals are handled in exact long arithmetic: going through double would
    // lose precision above 2^53 and hide overflow.
    if (matcher.group(2) == null) {
      try {
        return Math.multiplyExact(Long.parseLong(digits), factor);
      } catch (NumberFormatException | ArithmeticException e) {
        throw new IllegalArgumentException("Byte size out of range: " + input, e);
      }
    }

    // A double -> long cast saturates at Long.MAX_VALUE instead of failing, so the range
    // has to be checked explicitly. (double) Long.MAX_VALUE is exactly 2^63.
    double scaled = Double.parseDouble(digits) * factor;
    if (scaled >= (double) Long.MAX_VALUE) {
      throw new IllegalArgumentException("Byte size out of range: " + input);
    }
    return (long) scaled;
  }

  /**
   * @return the parsed size in bytes
   */
  public long getBytes() {
    return bytes;
  }

  /**
   * @return the parsed size in bytes, boxed as a {@code Long}
   */
  @Override
  public Object value() {
    return bytes;
  }

  @Override
  public TokenType type() {
    return TokenType.BYTE_SIZE;
  }

  /**
   * @return the original, unparsed literal as a JSON string
   */
  @Override
  public JsonElement toJson() {
    return new JsonPrimitive(original);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright © 2017-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.wrangler.api.parser;

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A {@link Token} that represents a time duration value with units.
 *
 * <p>A literal is a non-negative decimal number immediately followed by one of the units
 * ns, ms, s, m (minutes) or h (hours), case-insensitive. Surrounding whitespace is ignored.
 * The duration is stored in nanoseconds; a fractional number of nanoseconds is truncated
 * towards zero.
 */
public class TimeDuration implements Token {
  private static final Pattern PATTERN =
    Pattern.compile("(\\d+(?:\\.\\d+)?)([Nn][Ss]|[Mm][Ss]|[Ss]|[Mm]|[Hh])");
  private final long nanoseconds;
  private final String originalValue;

  /**
   * Parses the given literal.
   *
   * @param value duration literal, e.g. "100ms" or "1.5h"; must not be {@code null}
   * @throws IllegalArgumentException if the literal is malformed or the duration in
   *     nanoseconds does not fit in a {@code long}
   */
  public TimeDuration(String value) {
    this.originalValue = value;
    this.nanoseconds = parse(value);
  }

  /**
   * Converts a duration literal into nanoseconds.
   *
   * @param value the literal to convert
   * @return the duration in nanoseconds, truncated towards zero
   * @throws IllegalArgumentException if the literal is malformed or out of range
   */
  private static long parse(String value) {
    Matcher matcher = PATTERN.matcher(value.trim());
    if (!matcher.matches()) {
      throw new IllegalArgumentException("Invalid time duration format: " + value);
    }

    String digits = matcher.group(1);
    String unit = matcher.group(2).toUpperCase(Locale.ROOT);

    long nanosPerUnit;
    switch (unit) {
      case "NS":
        nanosPerUnit = 1L;
        break;
      case "MS":
        nanosPerUnit = 1_000_000L;
        break;
      case "S":
        nanosPerUnit = 1_000_000_000L;
        break;
      case "M":
        nanosPerUnit = 60L * 1_000_000_000L;
        break;
      case "H":
        nanosPerUnit = 3_600L * 1_000_000_000L;
        break;
      default:
        throw new IllegalArgumentException("Unsupported time duration unit: " + unit);
    }

    // Integer literals are handled in exact long arithmetic: going through double would
    // lose precision above 2^53 and hide overflow.
    if (digits.indexOf('.') < 0) {
      try {
        return Math.multiplyExact(Long.parseLong(digits), nanosPerUnit);
      } catch (NumberFormatException | ArithmeticException e) {
        throw new IllegalArgumentException("Time duration out of range: " + value, e);
      }
    }

    // A double -> long cast saturates at Long.MAX_VALUE instead of failing, so the range
    // has to be checked explicitly. (double) Long.MAX_VALUE is exactly 2^63.
    double scaled = Double.parseDouble(digits) * nanosPerUnit;
    if (scaled >= (double) Long.MAX_VALUE) {
      throw new IllegalArgumentException("Time duration out of range: " + value);
    }
    return (long) scaled;
  }

  /**
   * @return the duration in seconds rendered with two decimals and an "s" suffix,
   *     e.g. "1.50s"; formatted with {@link Locale#ROOT} so the decimal separator does
   *     not depend on the JVM's default locale
   */
  @Override
  public Object value() {
    return String.format(Locale.ROOT, "%.2f%s", getSeconds(), "s");
  }

  @Override
  public TokenType type() {
    return TokenType.TIME_DURATION;
  }

  /**
   * @return a JSON object carrying the token type name, the original literal and the
   *     parsed duration in nanoseconds
   */
  @Override
  public JsonElement toJson() {
    JsonObject object = new JsonObject();
    object.addProperty("type", type().name());
    object.addProperty("value", originalValue);
    object.addProperty("nanoseconds", nanoseconds);
    return object;
  }

  /**
   * @return the duration in nanoseconds
   */
  public long getNanoseconds() {
    return nanoseconds;
  }

  /**
   * @return the duration in milliseconds, possibly fractional
   */
  public double getMilliseconds() {
    return nanoseconds / 1_000_000.0;
  }

  /**
   * @return the duration in seconds, possibly fractional
   */
  public double getSeconds() {
    return nanoseconds / 1_000_000_000.0;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
* @see Expression
* @see Text
* @see TextList
* @see Identifier
* @see ByteSize
* @see TimeDuration
*/
@PublicEvolving
public enum TokenType implements Serializable {
Expand Down Expand Up @@ -152,5 +155,17 @@ public enum TokenType implements Serializable {
* Represents the enumerated type for the object of type {@code String} with restrictions
* on characters that can be present in a string.
*/
IDENTIFIER
IDENTIFIER,
/**
* Represents the enumerated type for the object of type {@code ByteSize}.
* This type is associated with strings such as "1KB", "50MB", "5GB", etc.
*/
BYTE_SIZE,

/**
* Represents the enumerated type for the object of type {@code TimeDuration}.
* This type is associated with strings such as "100ms", "2s", "5m", etc.
*/
TIME_DURATION

}
Loading