|
| 1 | +""" |
| 2 | +Helpers for extracting copy, cut, and paste signals from writing analytics |
| 3 | +events. |
| 4 | +""" |
| 5 | + |
| 6 | +from __future__ import annotations |
| 7 | + |
| 8 | +import datetime as dt |
| 9 | +import re |
| 10 | + |
| 11 | + |
# How long (ms) after a keyboard paste signal a document save may still be
# attributed to that paste.
PASTE_WAIT_MS = 2500
# How long (ms) after a context-menu style event a save may be inferred to be
# a menu paste.
MENU_FLAG_MS = 1500
# A signal arriving within this many ms of the previously counted signal of
# the same kind is treated as a duplicate and ignored.
DEDUP_MS = 750
# Inserted-text length (characters) at or above which a paste counts as "big".
BIG_PASTE_THRESHOLD = 200

# Matches Google Docs document URLs, capturing the document id (DOCID) and the
# path segment that follows it (ACT).  The dots in the host name are escaped
# so they match only a literal "." and not an arbitrary character.
DOC_URL_RE = re.compile(
    r"^https://docs\.google\.com/document/d/(?P<DOCID>[^/\s]+)/(?P<ACT>[a-zA-Z]+)"
)
| 18 | + |
| 19 | + |
def unwrap_event(event):
    """Return the client payload of *event* as a dict.

    When *event* is a dict whose ``"client"`` value is itself a dict, that
    inner dict is returned.  Any other dict is returned unchanged, and a
    non-dict input yields a new empty dict.
    """
    if not isinstance(event, dict):
        return {}
    inner = event.get("client")
    return inner if isinstance(inner, dict) else event
| 24 | + |
| 25 | + |
def get_doc_id(event):
    """Return the document id carried by *event*, or ``None``.

    The client payload's ``"doc_id"`` is returned when it is truthy.
    Otherwise the payload's ``"object"`` is consulted: if its ``"url"`` is a
    Google Docs document URL (per ``DOC_URL_RE``), the object's ``"id"`` is
    returned.  In every other case the result is ``None``.
    """
    client = unwrap_event(event)
    doc_id = client.get("doc_id")
    if doc_id:
        return doc_id

    # "object" may be missing, null, or some non-dict value in a malformed
    # event; treat all of those as "no object" rather than raising.
    obj = client.get("object")
    if not isinstance(obj, dict):
        return None

    url = obj.get("url")
    # Guard the type as well: matching a non-string would raise TypeError.
    if not isinstance(url, str) or not DOC_URL_RE.match(url):
        return None

    return obj.get("id")
| 37 | + |
| 38 | + |
def event_action(client):
    """Return the lower-cased action name of a client payload.

    The first truthy value among the ``"action"``, ``"event"``, ``"type"``
    and ``"event_type"`` fields is used.  If none is set, the nested
    ``"keystroke"`` dict's ``"action"`` or ``"type"`` is used instead.  The
    result is always a string, and is empty when nothing was found.
    """
    for field in ("action", "event", "type", "event_type"):
        candidate = client.get(field)
        if candidate:
            return str(candidate).lower()

    keystroke = client.get("keystroke")
    if not isinstance(keystroke, dict):
        return ""
    nested = keystroke.get("action") or keystroke.get("type") or ""
    return str(nested).lower()
| 45 | + |
| 46 | + |
def keys_info(client):
    """Extract normalised keyboard details from a client payload.

    Each field is looked up in the nested ``"keystroke"`` dict first and then
    on the payload itself.

    Returns a dict with:
      * ``"key"``: lower-cased key name, or ``""``.
      * ``"code"``: the key code name as a string, or ``""``.
      * ``"event_type"``: lower-cased event type, or ``""``.
      * ``"ctrl_or_meta"``: ``True`` when any of ``ctrl``, ``ctrlKey`` or
        ``metaKey`` is truthy.
      * ``"key_code"``: integer key code taken from ``keyCode`` / ``which``,
        or ``None`` when no usable number is present.
    """
    keystroke = client.get("keystroke")
    if not isinstance(keystroke, dict):
        keystroke = {}

    key = keystroke.get("key") or client.get("key") or ""
    code = keystroke.get("code") or client.get("code") or ""
    event_type = keystroke.get("type") or client.get("type") or ""
    ctrl = bool(
        keystroke.get("ctrl")
        or client.get("ctrl")
        or keystroke.get("ctrlKey")
        or client.get("ctrlKey")
    )
    meta = bool(keystroke.get("metaKey") or client.get("metaKey"))
    raw_key_code = (
        keystroke.get("keyCode")
        or client.get("keyCode")
        or keystroke.get("which")
        or client.get("which")
    )

    key_code = None
    # bool is a subclass of int but is never a real key code, so skip it.
    if isinstance(raw_key_code, (int, float)) and not isinstance(raw_key_code, bool):
        try:
            key_code = int(raw_key_code)
        except (ValueError, OverflowError):
            # NaN / infinity cannot be converted; report "no key code".
            key_code = None

    return {
        "key": str(key).lower() if key else "",
        "code": str(code),
        "event_type": str(event_type).lower(),
        "ctrl_or_meta": bool(ctrl or meta),
        "key_code": key_code,
    }
| 62 | + |
| 63 | + |
def is_copy(client):
    """Return ``True`` when the client payload represents a copy.

    A payload qualifies either through one of the known copy action names or
    through a ``keydown`` of the C key with Ctrl/Meta held.
    """
    copy_actions = {"copy", "clipboard_copy", "gdocs_copy", "menu_copy", "edit_copy"}
    if event_action(client) in copy_actions:
        return True

    keys = keys_info(client)
    if keys["event_type"] != "keydown" or not keys["ctrl_or_meta"]:
        return False
    # Any one of the three key identifiers is enough to recognise "C".
    return keys["key"] == "c" or keys["code"] == "KeyC" or keys["key_code"] == 67
| 72 | + |
| 73 | + |
def is_cut(client):
    """Return ``True`` when the client payload represents a cut.

    A payload qualifies either through one of the known cut action names or
    through a ``keydown`` of the X key with Ctrl/Meta held.
    """
    cut_actions = {"cut", "clipboard_cut", "gdocs_cut", "menu_cut", "edit_cut"}
    if event_action(client) in cut_actions:
        return True

    keys = keys_info(client)
    if keys["event_type"] != "keydown" or not keys["ctrl_or_meta"]:
        return False
    # Any one of the three key identifiers is enough to recognise "X".
    return keys["key"] == "x" or keys["code"] == "KeyX" or keys["key_code"] == 88
| 82 | + |
| 83 | + |
def is_paste_keyboard(client):
    """Return ``True`` when the client payload is an explicit paste signal.

    A payload qualifies either through one of the known paste action names or
    through a ``keydown`` of the V key with Ctrl/Meta held.
    """
    paste_actions = {"paste", "clipboard_paste", "gdocs_paste", "insert_from_clipboard"}
    if event_action(client) in paste_actions:
        return True

    keys = keys_info(client)
    if keys["event_type"] != "keydown" or not keys["ctrl_or_meta"]:
        return False
    # Any one of the three key identifiers is enough to recognise "V".
    return keys["key"] == "v" or keys["code"] == "KeyV" or keys["key_code"] == 86
| 92 | + |
| 93 | + |
def looks_like_menu_paste(client):
    """Return ``True`` when the payload's action hints at a menu-driven paste.

    Besides the explicit menu/edit/context-menu paste action names, a bare
    ``"contextmenu"`` action also counts.
    """
    menu_actions = ("menu_paste", "edit_paste", "contextmenu_paste", "contextmenu")
    return event_action(client) in menu_actions
| 99 | + |
| 100 | + |
def timestamp_ms(event, client=None):
    """Return a millisecond timestamp for *event*.

    Lookup order:
      1. ``"timestamp"``, ``"ts"``, ``"time"``, ``"t"`` on the client payload,
         then on the event itself.
      2. ``event["server"]["time"]``.
      3. The current UTC time, when nothing usable was found.

    Numeric values below ``10**11`` are multiplied by 1000 (i.e. handled as
    seconds); larger values are used as they are.  Booleans and non-finite
    floats are ignored.

    Args:
        event: The raw event; non-dict values are tolerated.
        client: Optional pre-extracted client payload.  When falsy it is
            derived from *event* with ``unwrap_event``.
    """
    client = client or unwrap_event(event)

    def to_ms(value):
        # bool is an int subclass but never a timestamp.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        # NaN / infinity cannot be converted to int.
        if value != value or value in (float("inf"), float("-inf")):
            return None
        return int(value * 1000) if value < 10**11 else int(value)

    for source in (client, event):
        if not isinstance(source, dict):
            continue
        for key in ("timestamp", "ts", "time", "t"):
            millis = to_ms(source.get(key))
            if millis is not None:
                return millis

    server = event.get("server") if isinstance(event, dict) else None
    if isinstance(server, dict):
        millis = to_ms(server.get("time"))
        if millis is not None:
            return millis

    # Use an aware datetime: calling .timestamp() on the naive result of
    # utcnow() interprets it as local time and skews the value by the local
    # UTC offset.
    return int(dt.datetime.now(dt.timezone.utc).timestamp() * 1000)
| 115 | + |
| 116 | + |
def collect_inserted_text(command, output):
    """Append every inserted string found in *command* to *output*.

    A dict with ``"ty" == "is"`` contributes its non-empty string ``"s"``.
    The walk then descends into a dict under ``"nmc"`` and into each entry of
    the ``"mts"`` sequence, in that order.  Non-dict commands are ignored.
    (Presumably this mirrors the Google Docs save-command format; confirm
    against the event producer.)

    Args:
        command: A command node, possibly nested.
        output: List that receives the strings, mutated in place.
    """
    if not isinstance(command, dict):
        return

    text = command.get("s") if command.get("ty") == "is" else None
    if isinstance(text, str) and text:
        output.append(text)

    nested = command.get("nmc")
    if isinstance(nested, dict):
        collect_inserted_text(nested, output)

    for sub_command in command.get("mts") or ():
        collect_inserted_text(sub_command, output)
| 131 | + |
| 132 | + |
def extract_insert_from_gdocs_save(client):
    """Return all text inserted by a save payload as one string.

    Walks every command of every entry in ``client["bundles"]`` and joins the
    inserted strings in the order they are encountered.  A missing or empty
    ``"bundles"`` / ``"commands"`` yields an empty string.
    """
    fragments = []
    bundles = client.get("bundles") or []
    for bundle in bundles:
        commands = (bundle or {}).get("commands") or []
        for command in commands:
            collect_inserted_text(command, fragments)
    return "".join(fragments)
| 139 | + |
| 140 | + |
def paste_length_bin(length):
    """Return the name of the length bucket that *length* falls into.

    Buckets: ``"none"`` (0 or less), ``"short_1_20"``, ``"medium_21_200"``
    and ``"long_201_plus"``.
    """
    # Inclusive upper bounds, checked in ascending order.
    upper_bounds = ((0, "none"), (20, "short_1_20"), (200, "medium_21_200"))
    for bound, label in upper_bounds:
        if length <= bound:
            return label
    return "long_201_plus"
| 149 | + |
| 150 | + |
def append_recent(items, entry, limit=10):
    """Return a new list of *items* plus *entry*, keeping the newest *limit*.

    The input list is never mutated.  ``None`` (or any falsy *items*) is
    treated as an empty list.

    Args:
        items: Existing entries, oldest first.
        entry: The entry to append.
        limit: Maximum number of entries to keep.  Zero or a negative value
            keeps nothing.

    Returns:
        A new list holding at most *limit* entries, oldest first.
    """
    # A "[-0:]" slice would return the whole list and a negative limit would
    # slice from the wrong end, so handle non-positive limits explicitly.
    if limit <= 0:
        return []
    recent = list(items or [])
    recent.append(entry)
    return recent[-limit:]
| 157 | + |
| 158 | + |
def default_paste_state():
    """Return a fresh paste-tracking state with every counter at zero.

    Each call builds new containers, so the result can be mutated freely.
    """
    counter_keys = (
        "paste_count",
        "pastes_with_length",
        "total_paste_chars",
        "max_paste_len",
        "last_paste_len",
        "big_pastes",
    )
    bin_keys = ("short_1_20", "medium_21_200", "long_201_plus")

    state = dict.fromkeys(counter_keys, 0)
    state["length_bins"] = dict.fromkeys(bin_keys, 0)
    state["recent_pastes"] = []
    # Time-window markers; 0 means no window is open.
    state["awaiting_paste_until"] = 0
    state["maybe_menu_paste_until"] = 0
    state["last_paste_signal_ms"] = 0
    return state
| 177 | + |
| 178 | + |
def default_copy_cut_state():
    """Return a fresh copy/cut-tracking state with every counter at zero.

    Each call builds a new dict and a new ``"recent_events"`` list.
    """
    state = dict.fromkeys(
        ("copy_count", "cut_count", "last_copy_ts", "last_cut_ts"), 0
    )
    state["recent_events"] = []
    return state
| 187 | + |
| 188 | + |
def update_paste_state(event, state):
    """Fold one event into the paste-tracking state.

    Three kinds of event matter:
      * an explicit paste signal (see ``is_paste_keyboard``) counts a paste
        and opens a ``PASTE_WAIT_MS`` window in which a save can supply the
        pasted length;
      * a menu-style event (see ``looks_like_menu_paste``) opens a
        ``MENU_FLAG_MS`` window in which a save is inferred to be a paste;
      * a ``google_docs_save`` with inserted text, arriving inside one of
        those windows, records the length statistics.

    Args:
        event: Raw event dict; its ``"client"`` entry is the payload.
        state: Previous state, or ``None`` to start from the defaults.

    Returns:
        The updated state as a new dict when the event changed anything,
        otherwise ``False``.  The input *state* is not mutated.
    """
    state = dict(default_paste_state() if state is None else state)
    client = event.get("client", {}) or {}
    ts_ms = timestamp_ms(event, client)

    if is_paste_keyboard(client):
        # Ignore repeats of the same paste gesture.
        if ts_ms - state.get("last_paste_signal_ms", 0) <= DEDUP_MS:
            return False
        state["paste_count"] = state.get("paste_count", 0) + 1
        state["last_paste_signal_ms"] = ts_ms
        state["awaiting_paste_until"] = ts_ms + PASTE_WAIT_MS
        state["recent_pastes"] = append_recent(
            state.get("recent_pastes"),
            {"timestamp_ms": ts_ms, "length": None, "source": "keyboard_signal"},
        )
        return state

    if looks_like_menu_paste(client):
        state["maybe_menu_paste_until"] = ts_ms + MENU_FLAG_MS
        return state

    if event_action(client) != "google_docs_save":
        return False

    inserted_text = extract_insert_from_gdocs_save(client)
    if not inserted_text:
        return False

    paste_length = len(inserted_text)
    awaiting_paste_until = state.get("awaiting_paste_until", 0)
    maybe_menu_paste_until = state.get("maybe_menu_paste_until", 0)
    counted_from_save = False

    # Inside the menu window but outside the keyboard window: no paste has
    # been counted for this save yet, so count it here.
    if ts_ms <= maybe_menu_paste_until and ts_ms > awaiting_paste_until:
        if ts_ms - state.get("last_paste_signal_ms", 0) <= DEDUP_MS:
            return False
        state["paste_count"] = state.get("paste_count", 0) + 1
        state["last_paste_signal_ms"] = ts_ms
        counted_from_save = True

    # A save outside both windows is ordinary typing, not a paste.
    if ts_ms > awaiting_paste_until and not counted_from_save:
        return False

    state["pastes_with_length"] = state.get("pastes_with_length", 0) + 1
    state["total_paste_chars"] = state.get("total_paste_chars", 0) + paste_length
    state["max_paste_len"] = max(state.get("max_paste_len", 0), paste_length)
    state["last_paste_len"] = paste_length
    if paste_length >= BIG_PASTE_THRESHOLD:
        state["big_pastes"] = state.get("big_pastes", 0) + 1

    bin_name = paste_length_bin(paste_length)
    if bin_name != "none":
        # ``state`` is only a shallow copy, so copy the nested dict before
        # changing it; otherwise the caller's previous state is altered too.
        length_bins = dict(state.get("length_bins") or {})
        length_bins[bin_name] = length_bins.get(bin_name, 0) + 1
        state["length_bins"] = length_bins

    state["recent_pastes"] = append_recent(
        state.get("recent_pastes"),
        {
            "timestamp_ms": ts_ms,
            "length": paste_length,
            "source": "menu_inferred" if counted_from_save else "google_docs_save",
        },
    )
    # Both windows are consumed by this save.
    state["awaiting_paste_until"] = 0
    state["maybe_menu_paste_until"] = 0
    return state
| 255 | + |
| 256 | + |
def update_copy_cut_state(event, state):
    """Fold one event into the copy/cut-tracking state.

    Args:
        event: Raw event dict; its ``"client"`` entry is the payload.
        state: Previous state, or ``None`` to start from the defaults.

    Returns:
        The updated state as a new dict when the event is a copy or cut that
        is not a duplicate (within ``DEDUP_MS`` of the last one of its kind),
        otherwise ``False``.
    """
    if state is None:
        state = default_copy_cut_state()
    state = dict(state)
    client = event.get("client", {}) or {}
    ts_ms = timestamp_ms(event, client)

    # Copy takes precedence over cut, and only one of them is evaluated.
    if is_copy(client):
        kind, count_key, stamp_key = "copy", "copy_count", "last_copy_ts"
    elif is_cut(client):
        kind, count_key, stamp_key = "cut", "cut_count", "last_cut_ts"
    else:
        return False

    if ts_ms - state.get(stamp_key, 0) <= DEDUP_MS:
        return False

    state[count_key] = state.get(count_key, 0) + 1
    state[stamp_key] = ts_ms
    state["recent_events"] = append_recent(
        state.get("recent_events"),
        {"timestamp_ms": ts_ms, "event_type": kind},
    )
    return state
0 commit comments